** Bartik employment growth for The Effects of Changes in Local Bank Health on Household Consumption
** by Daniel Cooper and Joe Peek, ReStat 2020
/* ////////////////////////////////////////////////////////////////////////// */
clear all
set more off
set maxvar 30000

/*This code uses BLS QCEW data (NAICS and SIC) to construct MSA-industry level employment 
data and then construct a bartik indicator 
	Raw data are downloaded from QCEW: https://www.bls.gov/cew/datatoc.htm

Deals with NAICS-SIC level splice. Extends SIC data forwards or NAICS data backwards based 
on the other's growth rates. (Need levels to calculate shares in Bartik calculation).
For growth rates, uses true levels of NAICS and SIC and does growth rate splice 
using OUR NAICS reconstructed 2000 consistent data. 

More details about this approach can be found in the online appendix although some are 
provided inline here. 
 

Code involves fairly significant SIC rearranging to match NAICS codes
	mostly grabs publishing out of manufacturing, information (data processing) out of 
	business services, and leasing out of business services. And combines categories.
	
We ignore unclassified industries

In instances of holes in the data due to nondisclosures we use linear approximation before and after. 
If data for an industry starts or ends 'N', than apply procedure to the start or end. 
Assign zero if always 'N'.
	
Annual and quarterly versions of Bartik employment
	quarterly for PSID
	annual for equifax -- Equifax is at MSA level, not individual 

The same approach is used for reading in and cleaning the data at the national, state and MSA levels. */


***************************************************************
* The code is divided into blocks for ease of execution based on 
* the level of aggregation of the data.
***************************************************************
scalar national = 0
	scalar nat_naic = 0
	scalar nat_sic = 0
	scalar nat_merge = 0

scalar state = 0
	scalar st_naic_data = 0
	scalar st_sic_data = 0
	scalar merge_st = 0

scalar msa = 0
	scalar naics = 0
	scalar sic = 0
	scalar merge_msa = 0
	
scalar weights = 0
	scalar state_weights = 0
	scalar msa_weights = 0
********************************************************************************

* update the line below with the relevant path
local data "/shared/Joe_Peek/bankhealth_nocamels/data"


* Read in industry level NAICS data.

*National data
if national {

if nat_naic {

forval y=1999/2016 {
* NOTE 2016 data folder has sub-industries with broader categories and file names different than in other years, thus procedure slightly different. 

	qui: fs "`data'/naic_county_files/`y'_qtrly_by_industry/`y'.q1-q4.by_industry/`y'.q1-q4*"
	
	if `y' != 2016 {
		*pulling data from csv format 
		foreach c in `r(files)' {
		
			if regexm("`c'", "[U][n][c][l][a][s][s]") != 1 {
				import delimited using "`data'/naic_county_files/`y'_qtrly_by_industry/`y'.q1-q4.by_industry/`c'", colrange(1:18) varnames(1) clear stringcols(1 4 8 9)
				keep area_fips area_title industry_code year qtr month1_emplvl month2_emplvl month3_emplvl disclosure_code own_title agglvl_code qtrly_estabs_count own_title own_code industry_title
			
				*keep national 
				keep if area_fips == "US000"
				
				*make sure industry-code is a string so everything will match
				tostring industry_code, replace
				
				*save
				if regexm("`c'", "[0-9] [R][e][t][a][i][l] [t][r][a][d][e]") == 1 | regexm("`c'", "[3][3] [M][a][n][u][f][a][c][t][u][r][i][n][g]") == 1 | regexm("`c'", "[4][9] [T][r]") == 1 {
					local filest = substr("`c'", 12, 5)
				}
				if regexm("`c'", "[4] [1-9][0-6] [A-Z]") == 1 {
					local filest = substr("`c'", 12, 2)
				}
				
				save "`data'/naic_county_files/`filest'_nat_`y'", replace
			}
		}
	}
	
	qui: fs "`data'/naic_county_files/`y'_qtrly_by_industry/`y'.q1-q4.by_industry/`y'.q1-q4*"
	
	if `y' == 2016 {
	
		*pulling data from csv format 
		foreach c in `r(files)' {
		
			local c_mod = subinstr("`c'", " NAICS", "", 1)

			if regexm("`c_mod'", "[U][n][c][l][a][s][s]") != 1 {
				import delimited using "`data'/naic_county_files/`y'_qtrly_by_industry/`y'.q1-q4.by_industry/`c'", colrange(1:18) varnames(1) clear stringcols(1 4 8 9)
				keep area_fips area_title industry_code year qtr month1_emplvl month2_emplvl month3_emplvl disclosure_code own_title agglvl_code qtrly_estabs_count own_title own_code industry_title
			
				*keep national 
				keep if area_fips == "US000"
		
				*make sure industry-code is a string so everything will match
				tostring industry_code, replace
				
				*save
				if regexm("`c_mod'", "[0-9] [R][e][t][a][i][l] [t][r][a][d][e]") == 1 | regexm("`c'", "[3][3] [M][a][n][u][f][a][c][t][u][r][i][n][g]") == 1 | regexm("`c'", "[4][9] [T][r]") == 1 {
					local filest = substr("`c'", 12, 5)
					
					save "`data'/naic_county_files/`filest'_nat_`y'", replace
					
				}
				if regexm("`c_mod'", " [1-9][0-6] [A-Z]") == 1 {
					local filest = substr("`c'", 12, 2)
					
					save "`data'/naic_county_files/`filest'_nat_`y'", replace

				}
			}
		}	
	}
}


forval y=1999/2016{
*appending data
use "`data'/naic_county_files/11_nat_`y'", replace

foreach ind in 21 22 23 31-33 42 44-45 48-49 51 52 53 54 55 56 61 62 71 72 81 92 {
	append using "`data'/naic_county_files/`ind'_nat_`y'"
	}

tempfile naic_`y'
save "`naic_`y''"

}


use "`naic_1999'"

forval y=2000/2016 {
 
	append using "`naic_`y''"

}

save `data'/bartik_naic_nat_pre.dta, replace



* DEAL WITH NONDISCLOSURES IN THE NAICS DATA and do some additional cleaning/labeling

g date_quarter = yq(year, qtr)
format date_quarter %tq

**nondisclosures: avg if beg and end, extend if just one, zero if none
bys area_fips industry_code own_code year qtr: g nondis = 1 if disclosure_code == ""

forval i = 1/3 {
	replace month`i'_emplvl = . if disclosure_code != ""
}

*linear interpolation if have real values on either side
sort area_fips industry_code own_code date_quarter
forval i = 1/3 {
	by area_fips industry_code own_code: ipolate month`i'_emplvl date_quarter, g(month`i')
	replace month`i'_emplvl = month`i' if month`i'_emplvl == . 
}

*extend values if have a real value on either side 
forval i = 1/3 {
	sort area_fips industry_code own_code date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
	gsort area_fips industry_code own_code -date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
}

*otherwise, nondisclosures are zeroes because no data we can use
forval i = 1/3 {
	replace month`i'_emplvl = 0 if month`i'_emplvl == . 
}

*aggregate (sum) by own_code
	*own codes: private, government etc. 
forval i = 1/3 {
	bysort area_fips industry_code year qtr: ereplace month`i'_emplvl = sum(month`i'_emplvl)
}
			
*Get rid of repetitive observations
collapse (max) month1_emplvl month2_emplvl month3_emplvl (first) area_title area_fips industry_title agglvl_code disclosure_code own_code own_title , by(industry_code date_quarter) 


**industry

g ind_agg = .
replace ind_agg = 1 if industry_code == "11" 
replace ind_agg = 2 if industry_code == "21"
replace ind_agg = 3 if industry_code == "22"
replace ind_agg = 4 if industry_code == "23"
replace ind_agg = 5 if industry_code == "31-33"
replace ind_agg = 6 if industry_code == "42" | industry_code == "44-45"
replace ind_agg = 7 if industry_code == "48-49"
replace ind_agg = 8 if industry_code == "51"
replace ind_agg = 9 if industry_code == "52" | industry_code == "53"
replace ind_agg = 10  if industry_code == "54" | industry_code == "55" | industry_code == "56"
replace ind_agg = 11 if industry_code == "61" | industry_code == "62"
replace ind_agg = 12 if industry_code == "71" | industry_code == "72"
replace ind_agg = 13 if industry_code == "81"
replace ind_agg = 14 if industry_code == "92"
*replace ind_agg = . if industry_code == "99"
replace ind_agg = 15 if industry_code == "10"
	*total 

#delimit ; 

label define industry
1 "Agriculture, forestry, fishing, and hunting"
2 "Mining" 
3 "Utilities"
4 "Construction" 
5 "Manufacturing" 
6 "Retail and Wholesale Trade"
7 "Transportation and warehousing"
8 "Information"
9 "Finance, Insurance, Real Estate, Rental, and Leasing"
10 "Professional and Business Services"
11 "Educational Services, Health Care, and Social Assistance"
12 "Arts, Entertainment, Recreation, Accomodation, and Food Services"
13 "Other services, except Government"
14 "Government"
15 "Total"
;
#delimit cr 

drop if ind_agg == . 

label values ind_agg industry


*collapse by industries 
*mark if all the employment levels in an area-industry-time group are missing (if the first one is missing then all are since . sorted to last)
	*if one component of the industry is missing, then just treat as zero 
bys area_fips ind_agg date_quarter: g allmissing = mi(month1_emplvl[1])
collapse (sum) month1_emplvl month2_emplvl month3_emplvl (min) allmissing, by(area_fips ind_agg date_quarter)
*get the missings back -- collapse makes zero 
replace month1_emplvl= . if allmissing
replace month2_emplvl = . if allmissing
replace month3_emplvl = . if allmissing

*more cleaning

foreach v of varlist month* {
	rename `v' `v'_naic
}

count if month1_emplvl_naic==. | month2_emplvl_naic==. | month3_emplvl_naic==.

*quarterly figure: means of months 
egen total_emp_naic = rowmean(month1_emplvl_naic month2_emplvl_naic month3_emplvl_naic)

g state = 100
drop area_fips

save `data'/bartik_naic_nat_cleaned.dta, replace

}


* INPUT THE SIC-based data. 


if nat_sic {

forval y=1975/2000 {

	qui: fs "`data'/sic_files/sic_`y'_qtrly_by_industry/sic.`y'.q1-q4.by_industry/sic.`y'.q1-q4*"
	
	*pulling the data from csv format
	foreach c in `r(files)' {
	
		if regexm("`c'", "[0][A-K] \(") == 1 | regexm("`c'", "[0][I][7-8][0-9] \(") == 1 | regexm("`c'", "[0][E][4][0-9] \(") == 1 | regexm("`c'", "[0][I][7-8][0-9][0-9] \(") == 1 | regexm("`c'", "[0][I][7-8][0-9][0-9][0-9] \(") == 1 | regexm("`c'", "[0][E][4][7][0-9] \(") == 1 | regexm("`c'", "[0][D][2-3][0-9] \(") == 1 {
			import delimited using "`data'/sic_files/sic_`y'_qtrly_by_industry/sic.`y'.q1-q4.by_industry/`c'", varnames(1) colrange(1:17) clear
			keep area_fips area_title year qtr month1_emplvl month2_emplvl month3_emplvl industry_code industry_title disclosure_code agglvl_code own_code own_title
			
			*keeping US
			keep if area_fips == "US000"
							
			*make sure industry-code is a string so everything will match
			tostring industry_code, replace
			tostring disclosure_code, replace
		
			*save
			if regexm("`c'", "[0][A-K] \(") == 1 {
				local filest = substr("`c'", 16, 2)
			}
			if regexm("`c'", "[0][I][7-8][0-9] \(") == 1 | regexm("`c'", "[0][E][4][0-9] \(") == 1 | regexm("`c'", "[0][D][2-3][0-9] \(") == 1 {
				local filest = substr("`c'", 16, 4)
			}
			if regexm("`c'", "[0][I][7-8][0-9][0-9] \(") == 1 | regexm("`c'", "[0][E][4][7][0-9] \(") == 1 {
				local filest = substr("`c'", 16, 5)
			}
			if regexm("`c'", "[0][I][7-8][0-9][0-9][0-9] \(") == 1 {
				local filest = substr("`c'", 16, 6)
			}
			
			save "`data'/sic_files/`filest'_nat_`y'", replace
		}
	}
}

*append
forval y=1975/2000{

use "`data'/sic_files/0A_nat_`y'", clear

foreach letter in B C F G H J K D20 D21 D22 D23 D24 D25 D26 D27 D28 D29 D30 D31 D32 D33 D34 D35 D36 D37 D38 D39 I70 I72 I76 I79 I731 I732 I733 I734 I735 I736 I7370 I7371 I7372 I7373 I7374 I7375 I7376 I7377 I7378 I7379 I738 I751 I752 I754 I753 I754 I781 I782 I783 I784 I80 I81 I82 I83 I84 I86 I88 I89 E40 E41 E42 E43 E44 E45 E46 E472 E473 E474 E478 E48 E49 {
	capture: append using "`data'/sic_files/0`letter'_nat_`y'"
}
*need to capture because not all of the subcategories are in every year of data

tempfile sic_`y'
save `sic_`y''

}


use "`sic_1975'"


forval y=1976/2000 {

	append using "`sic_`y''"

}


save `data'/bartik_sic_nat_pre.dta, replace


* DEAL WITH DISCLOSURES IN THE SIC data.

use `data'/bartik_sic_nat_pre.dta, clear

g date_quarter = yq(year, qtr)
format date_quarter %tq


**nondisclosures: avg if beg and end, extend if just one, zero if none
bys area_fips industry_code own_code year qtr: g nondis = 1 if disclosure_code == ""

forval i = 1/3 {
	replace month`i'_emplvl = . if disclosure_code != ""
}

*linear interpolation if have real values on either side
sort area_fips industry_code own_code date_quarter
forval i = 1/3 {
	by area_fips industry_code own_code: ipolate month`i'_emplvl date_quarter, g(month`i')
	replace month`i'_emplvl = month`i' if month`i'_emplvl == . 
}

*extend values if have a real value on either side 
forval i = 1/3 {
	sort area_fips industry_code own_code date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
	gsort area_fips industry_code own_code -date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
}

*otherwise, nondisclosures are zeroes because no data we can use
forval i = 1/3 {
	replace month`i'_emplvl = 0 if month`i'_emplvl == . 
}

*aggregate by own_code
forval i = 1/3 {
	bysort area_fips industry_code year qtr: ereplace month`i'_emplvl = sum(month`i'_emplvl)
}
			
*Get rid of repetitive observations
collapse (max) month1_emplvl month2_emplvl month3_emplvl (first) area_title  industry_title agglvl_code disclosure_code own_code own_title , by(area_fips industry_code date_quarter) 

*industries:: rearrange to match NAIC 
g ind_agg = .
replace ind_agg = 1 if industry_code == "SIC_0A" 
replace ind_agg = 2 if industry_code == "SIC_0B"
replace ind_agg = 4 if industry_code == "SIC_0C"
replace ind_agg = 5 if industry_code == "SIC_0D20" | regexm(industry_code, "[0][D][2][1-6]$") == 1 | regexm(industry_code, "[0][D][2][8-9]$") == 1 | regexm(industry_code, "[0][D][3][0-9]$") == 1 
replace ind_agg = 7 if industry_code == "SIC_0E40" | industry_code == "SIC_0E46" | industry_code == "SIC_0E41" | industry_code == "SIC_0E42" | industry_code == "SIC_0E44" | industry_code == "SIC_0E45" | industry_code == "SIC_0E472" | industry_code == "SIC_0E473"  | industry_code == "SIC_0E478"
replace ind_agg = 3 if industry_code == "SIC_0E49" 
replace ind_agg = 8 if industry_code == "SIC_0E48" | industry_code == "SIC_0I78" | industry_code == "SIC_0D27" | industry_code == "SIC_0I7370" | industry_code == "SIC_0I7371" | industry_code == "SIC_0I7372" | industry_code == "SIC_0I7373"  | industry_code == "SIC_0I7374"  | industry_code == "SIC_0I7375"  | industry_code == "SIC_0I7376" | industry_code == "SIC_0I7379" | industry_code == "SIC_0D27" 
replace ind_agg = 6 if industry_code == "SIC_0F" | industry_code == "SIC_0G"
replace ind_agg = 9 if industry_code == "SIC_0H" | industry_code == "SIC_0I735" | industry_code == "SIC_0I7377" | industry_code == "SIC_0I751" | industry_code == "SIC_0I784" | industry_code == "SIC_0E474"
replace ind_agg = 13 if industry_code == "SIC_0I72" | industry_code == "SIC_0I86" | industry_code == "SIC_0I752" | industry_code == "SIC_0I753" | industry_code == "SIC_0I754" | industry_code == "SIC_0I76" | industry_code == "SIC_0I88" | industry_code == "SIC_0I89" | industry_code == "SIC_0I7378"
replace ind_agg = 12 if industry_code == "SIC_0I70" | industry_code == "SIC_0I79" | industry_code == "SIC_0I84" | industry_code == "SIC_0I781" | industry_code == "SIC_0I782" | industry_code == "SIC_0I783"   
replace ind_agg = 10 if industry_code == "SIC_0I81" | industry_code == "SIC_0I731" | industry_code == "SIC_0I732" | industry_code == "SIC_0I733" | industry_code == "SIC_0I734" | industry_code == "SIC_0I736" | industry_code == "SIC_0I738" 
replace ind_agg = 11 if industry_code == "SIC_0I80"  | industry_code == "SIC_0I82" | industry_code == "SIC_0I83" 
replace ind_agg = 14 if industry_code == "SIC_0J" | industry_code == "SIC_0E43"
replace ind_agg = 15 if industry_code == "SIC_0Z"
	*total 
replace ind_agg = . if industry_code == "SIC_0K"

drop if ind_agg == . 

#delimit ; 

label define industry
1 "Agriculture, forestry, fishing, and hunting"
2 "Mining" 
3 "Utilities"
4 "Construction" 
5 "Manufacturing" 
6 "Retail and Wholesale Trade"
7 "Transportation and warehousing"
8 "Information"
9 "Finance, Insurance, Real Estate, Rental, and Leasing"
10 "Professional and Business Services"
11 "Educational Services, Health Care, and Social Assistance"
12 "Arts, Entertainment, Recreation, Accommodation, and Food Services"
13 "Other services, except Government"
14 "Government"
15 "Total"
;

#delimit cr

label values ind_agg industry

*collapse by industries 
*mark if all the employment levels in a area-industry-time group are missing (if the first one is missing then all are since . sorted to last)
	*if one component of the industry is missing, then just treat as zero 
bys area_fips ind_agg date_quarter: g allmissing = mi(month1_emplvl[1])
collapse (sum) month1_emplvl month2_emplvl month3_emplvl (min) allmissing, by(area_fips ind_agg date_quarter)
*get the missings back -- collapse makes zero 
replace month1_emplvl= . if allmissing
replace month2_emplvl = . if allmissing
replace month3_emplvl = . if allmissing

*more cleaning
rename month1_emplvl month1_emplvl_sic
rename month2_emplvl month2_emplvl_sic
rename month3_emplvl month3_emplvl_sic
	
egen total_emp_sic = rowmean(month1_emplvl_sic month2_emplvl_sic month3_emplvl_sic)

g state = 100
drop area_fips

save `data'/bartik_sic_nat_cleaned.dta, replace

}



*MERGE SIC AND NAICS data by industry and time and splice the data together


if nat_merge {

use `data'/bartik_sic_nat_cleaned.dta, clear

merge 1:1 date_quarter ind_agg using `data'/bartik_naic_nat_cleaned.dta

g year = yofd(dofq(date_quarter))
gen qtr = quarter(dofq(date_quarter))

xtset ind_agg date_quarter

*replace missings with zeros (all from non merges)
replace total_emp_sic = 0 if total_emp_sic ==. 
replace total_emp_naic = 0 if total_emp_naic ==. 

*generate SIC growth rates
g sic_gr = (total_emp_sic - L.total_emp_sic) / L.total_emp_sic
replace sic_gr = 0 if sic_gr == . | sic_gr == -1

*generate NAIC growth rates
g naic_gr = (total_emp_naic - L.total_emp_naic) / L.total_emp_naic
replace naic_gr = 0 if naic_gr == . | naic_gr == -1

*take NAIC 2001 level and grow it backwards according to SIC growth rates
g naic_back = total_emp_naic if year == 2001 & qtr == 1 
count if naic_back == . & date_quarter < 164 
while `r(N)' != 0 {
	bysort ind_agg (date_quarter): replace naic_back = naic_back[_n+1]/(1+sic_gr[_n+1]) if date_quarter < 164
	count if naic_back == . & date_quarter < 164
}

*splice
g total_emp_back = naic_back if year <= 2000
replace total_emp_back = total_emp_naic if year > 2000
la var total_emp_back "NAICS Backwards"

*alternative: take SIC 2000q4 level and grow it forwards according to NAIC growth rates
g sic_forward = total_emp_sic if year == 2000 & qtr == 4 
count if sic_forward ==. & date_quarter > 163 
while `r(N)' != 0 {
	bysort ind_agg (date_quarter): replace sic_forward = sic_forward[_n-1]*(1+naic_gr[_n]) if date_quarter > 163
	count if sic_forward == . & date_quarter > 163
}

*splice
g total_emp_for = total_emp_sic if year <= 2000
replace total_emp_for = sic_forward if year > 2000
la var total_emp_for "SIC Forwards"

*ratios of actual versus created: use in state and MSA splices 
g ratio_naic = total_emp_sic / naic_back
g ratio_sic = total_emp_naic / sic_forward

keep date_quarter state total_emp* ind_agg naic_back sic_forward ratio*

save `data'/bartik_nat_merged.dta, replace


}
}

*State Data
if state {
if st_naic_data {

forval y=1999/2016 {
*2016: data folder has sub-industries with broader categories and file names different. Thus download procedure different so don't save a file for each subcategory 

	qui: fs "`data'/naic_county_files/`y'_qtrly_by_industry/`y'.q1-q4.by_industry/`y'.q1-q4*"

	if `y' != 2016 {
	
		*pulling data from csv format 
		foreach c in `r(files)' {
			if regexm("`c'", "[U][n][c][l][a][s][s]") != 1 & regexm("`c'", "[1][0] [T][o][t][a][l]") != 1  {
				import delimited using "`data'/naic_county_files/`y'_qtrly_by_industry/`y'.q1-q4.by_industry/`c'", colrange(1:18) varnames(1) clear stringcols(1 4 8 9)
				keep area_fips area_title industry_code year qtr month1_emplvl month2_emplvl month3_emplvl disclosure_code own_title agglvl_code qtrly_estabs_count own_title own_code industry_title
			
				*keeping the states
				keep if agglvl_code=="54" | agglvl_code == "50"

				drop if area_fips=="US000"
		
				*make sure industry-code is a string so everything will match
				tostring industry_code, replace
				
				*save
				if regexm("`c'", "[0-9] [R][e][t][a][i][l] [t][r][a][d][e]") == 1 | regexm("`c'", "[3][3] [M][a][n][u][f][a][c][t][u][r][i][n][g]") == 1 | regexm("`c'", "[4][9] [T][r]") == 1 {
					local filest = substr("`c'", 12, 5)
				}
				if regexm("`c'", "[4] [1-9][0-6] [A-Z]") == 1 {
					local filest = substr("`c'", 12, 2)
				}

				save "`data'/naic_county_files/`filest'_st_`y'", replace
			}
		}	
	}
	
	if `y' == 2016 {
	
		*pulling data from csv format 
		foreach c in `r(files)' {
			local c_mod = subinstr("`c'", " NAICS", "", 1)

			if regexm("`c_mod'", "[U][n][c][l][a][s][s]") != 1 & regexm("`c'", "[1][0] [T][o][t][a][l]") != 1  {
				import delimited using "`data'/naic_county_files/`y'_qtrly_by_industry/`y'.q1-q4.by_industry/`c'", colrange(1:18) varnames(1) clear stringcols(1 4 8 9)
				keep area_fips area_title industry_code year qtr month1_emplvl month2_emplvl month3_emplvl disclosure_code own_title agglvl_code qtrly_estabs_count own_title own_code industry_title
			
				*keeping the states
				keep if agglvl_code=="54" | agglvl_code == "50"

				drop if area_fips=="US000"
		
				*make sure industry-code is a string so everything will match
				tostring industry_code, replace
				
				*save
				if regexm("`c_mod'", "[0-9] [R][e][t][a][i][l] [t][r][a][d][e]") == 1 | regexm("`c'", "[3][3] [M][a][n][u][f][a][c][t][u][r][i][n][g]") == 1 | regexm("`c'", "[4][9] [T][r]") == 1 {
					local filest = substr("`c'", 12, 5)
					
					save "`data'/naic_county_files/`filest'_st_`y'", replace

				}
				if regexm("`c_mod'", " [1-9][0-6] [A-Z]") == 1 {
					local filest = substr("`c'", 12, 2)
					
					save "`data'/naic_county_files/`filest'_st_`y'", replace
				}
			}
		}	
	}
}


forval y=1999/2016 {
*appending data
use "`data'/naic_county_files/11_st_`y'", replace

foreach ind in 21 22 23 31-33 42 44-45 48-49 51 52 53 54 55 56 61 62 71 72 81 92 {
	append using "`data'/naic_county_files/`ind'_st_`y'"
}


tempfile naic_`y'
save "`naic_`y''"

}


use "`naic_1999'"

forval y=2000/2016 {
 
	append using "`naic_`y''"

}

save `data'/bartik_naic_st_pre.dta, replace

g date_quarter = yq(year, qtr)
format date_quarter %tq

**nondisclosures: avg if beg and end, extend if just one, zero if none
bys area_fips industry_code own_code year qtr: g nondis = 1 if disclosure_code == ""

forval i = 1/3 {
	replace month`i'_emplvl = . if disclosure_code != ""
}

*linear interpolation if have real values on either side
sort area_fips industry_code own_code date_quarter
forval i = 1/3 {
	by area_fips industry_code own_code: ipolate month`i'_emplvl date_quarter, g(month`i')
	replace month`i'_emplvl = month`i' if month`i'_emplvl == . 
}

*extend values if have a real value on either side 
forval i = 1/3 {
	sort area_fips industry_code own_code date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
	gsort area_fips industry_code own_code -date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
}

*otherwise, nondisclosures are zeroes because no data we can use
forval i = 1/3 {
	replace month`i'_emplvl = 0 if month`i'_emplvl == . 
}

*aggregate by own_code
forval i = 1/3 {
	bysort area_fips industry_code year qtr: ereplace month`i'_emplvl = sum(month`i'_emplvl)
}
			
*Get rid of repetitive observations
collapse (max) month1_emplvl month2_emplvl month3_emplvl (first) area_title industry_title agglvl_code disclosure_code own_code own_title , by(area_fips industry_code date_quarter) 

**industry

g ind_agg = .
replace ind_agg = 1 if industry_code == "11" 
replace ind_agg = 2 if industry_code == "21"
replace ind_agg = 3 if industry_code == "22"
replace ind_agg = 4 if industry_code == "23"
replace ind_agg = 5 if industry_code == "31-33"
replace ind_agg = 6 if industry_code == "42" | industry_code == "44-45"
replace ind_agg = 7 if industry_code == "48-49"
replace ind_agg = 8 if industry_code == "51"
replace ind_agg = 9 if industry_code == "52" | industry_code == "53"
replace ind_agg = 10  if industry_code == "54" | industry_code == "55" | industry_code == "56"
replace ind_agg = 11 if industry_code == "61" | industry_code == "62"
replace ind_agg = 12 if industry_code == "71" | industry_code == "72"
replace ind_agg = 13 if industry_code == "81"
replace ind_agg = 14 if industry_code == "92"
*replace ind_agg = . if industry_code == "99"
replace ind_agg = 15 if industry_code == "10"
	*total 

#delimit ; 

label define industry
1 "Agriculture, forestry, fishing, and hunting"
2 "Mining" 
3 "Utilities"
4 "Construction" 
5 "Manufacturing" 
6 "Retail and Wholesale Trade"
7 "Transportation and warehousing"
8 "Information"
9 "Finance, Insurance, Real Estate, Rental, and Leasing"
10 "Professional and Business Services"
11 "Educational Services, Health Care, and Social Assistance"
12 "Arts, Entertainment, Recreation, Accomodation, and Food Services"
13 "Other services, except Government"
14 "Government"
15 "Total"
;
#delimit cr 

drop if ind_agg == . 

label values ind_agg industry

*collapse by industries 
*mark if all the employment levels in a area-industry-time group are missing (if the first one is missing then all are since . sorted to last)
	*if one component of the industry is missing, then just treat as zero 
bys area_fips ind_agg date_quarter: g allmissing = mi(month1_emplvl[1])
collapse (sum) month1_emplvl month2_emplvl month3_emplvl (min) allmissing, by(area_fips ind_agg date_quarter)
*get the missings back -- collapse makes zero 
replace month1_emplvl= . if allmissing
replace month2_emplvl = . if allmissing
replace month3_emplvl = . if allmissing

*more cleaning

destring area_fips, g(areafips)

foreach v of varlist month* {
	rename `v' `v'_naic
}

count if month1_emplvl_naic==. | month2_emplvl_naic==. | month3_emplvl_naic==.

*creating state code
g fipstate = substr(area_fips, 1, 2)
destring fipstate, replace

*crosswalk to PSID codes 
g state=fipstate if fipstate==1
replace state=50 if fipstate==2
replace state=fipstate-2 if fipstate<=6 & fipstate>2
replace state=fipstate-3 if fipstate<15 & fipstate>6
replace state=51 if fipstate==15
replace state=fipstate-5 if fipstate<43 & fipstate>15
replace state=fipstate-6 if fipstate>43 & fipstate<52
replace state=fipstate-7 if fipstate>52 & fipstate<=56

drop if state==. //this is just U.S. territories and other random stuff (like the FBI - undesignated location)
drop if state == 57 
egen total_emp_naic = rowmean(month1_emplvl_naic month2_emplvl_naic month3_emplvl_naic)

save `data'/bartik_naic_st_cleaned.dta, replace

}

if st_sic_data {

forval y=1975/2000 {

	qui: fs "`data'/sic_files/sic_`y'_qtrly_by_industry/sic.`y'.q1-q4.by_industry/sic.`y'.q1-q4*"
	
	*pulling the data from csv format
	foreach c in `r(files)' {
	
		if regexm("`c'", "[0][A-K] \(") == 1 | regexm("`c'", "[0][I][7-8][0-9] \(") == 1 | regexm("`c'", "[0][E][4][0-9] \(") == 1 | regexm("`c'", "[0][I][7-8][0-9][0-9] \(") == 1 | regexm("`c'", "[0][I][7-8][0-9][0-9][0-9] \(") == 1 | regexm("`c'", "[0][E][4][7][0-9] \(") == 1 | regexm("`c'", "[0][D][2-3][0-9] \(") == 1 {
			import delimited using "`data'/sic_files/sic_`y'_qtrly_by_industry/sic.`y'.q1-q4.by_industry/`c'", varnames(1) colrange(1:17) clear
			keep area_fips area_title year qtr month1_emplvl month2_emplvl month3_emplvl industry_code industry_title disclosure_code agglvl_code own_code own_title
			
			*keeping states
			keep if agglvl_code==20 | agglvl_code == 21 | agglvl_code == 18 | agglvl_code == 22 | agglvl_code == 23
						
			*make sure industry-code is a string so everything will match
			tostring industry_code, replace
			tostring disclosure_code, replace
	
			*save
			if regexm("`c'", "[0][A-K] \(") == 1 {
				local filest = substr("`c'", 16, 2)
			}
			if regexm("`c'", "[0][I][7-8][0-9] \(") == 1 | regexm("`c'", "[0][E][4][0-9] \(") == 1 | regexm("`c'", "[0][D][2-3][0-9] \(") == 1 {
				local filest = substr("`c'", 16, 4)
			}
			if regexm("`c'", "[0][I][7-8][0-9][0-9] \(") == 1 | regexm("`c'", "[0][E][4][7][0-9] \(") == 1 {
				local filest = substr("`c'", 16, 5)
			}
			if regexm("`c'", "[0][I][7-8][0-9][0-9][0-9] \(") == 1 {
				local filest = substr("`c'", 16, 6)
			}
			
			save "`data'/sic_files/`filest'_st_`y'", replace
		}
	}
}

*append
forval y=1975/2000{

use "`data'/sic_files/0A_st_`y'", clear

foreach letter in B C F G H J K D20 D21 D22 D23 D24 D25 D26 D27 D28 D29 D30 D31 D32 D33 D34 D35 D36 D37 D38 D39 I70 I72 I76 I79 I731 I732 I733 I734 I735 I736 I7370 I7371 I7372 I7373 I7374 I7375 I7376 I7377 I7378 I7379 I738 I751 I752 I754 I753 I754 I781 I782 I783 I784 I80 I81 I82 I83 I84 I86 I88 I89 E40 E41 E42 E43 E44 E45 E46 E472 E473 E474 E478 E48 E49 {
	capture: append using "`data'/sic_files/0`letter'_st_`y'"
}

tempfile sic_`y'
save `sic_`y''

}


use "`sic_1975'"


forval y=1976/2000 {

	append using "`sic_`y''"

}
save `data'/bartik_sic_st_pre.dta, replace

use `data'/bartik_sic_st_pre.dta, clear


g date_quarter = yq(year, qtr)
format date_quarter %tq

**nondisclosures: avg if beg and end, extend if just one, zero if none
bys area_fips industry_code own_code year qtr: g nondis = 1 if disclosure_code == ""

forval i = 1/3 {
	replace month`i'_emplvl = . if disclosure_code != ""
}

*linear interpolation if have real values on either side
sort area_fips industry_code own_code date_quarter
forval i = 1/3 {
	by area_fips industry_code own_code: ipolate month`i'_emplvl date_quarter, g(month`i')
	replace month`i'_emplvl = month`i' if month`i'_emplvl == . 
}

*extend values if have a real value on either side 
forval i = 1/3 {
	sort area_fips industry_code own_code date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
	gsort area_fips industry_code own_code -date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
}

*otherwise, nondisclosures are zeroes because no data we can use
forval i = 1/3 {
	replace month`i'_emplvl = 0 if month`i'_emplvl == . 
}

*aggregate by own_code
forval i = 1/3 {
	bysort area_fips industry_code year qtr: ereplace month`i'_emplvl = sum(month`i'_emplvl)
}
			
*Get rid of repetitive observations
collapse (max) month1_emplvl month2_emplvl month3_emplvl (first) area_title industry_title agglvl_code disclosure_code own_code own_title , by(area_fips industry_code date_quarter) 


*industries
g ind_agg = .
replace ind_agg = 1 if industry_code == "SIC_0A" 
replace ind_agg = 2 if industry_code == "SIC_0B"
replace ind_agg = 4 if industry_code == "SIC_0C"
replace ind_agg = 5 if industry_code == "SIC_0D20" | regexm(industry_code, "[0][D][2][1-6]$") == 1 | regexm(industry_code, "[0][D][2][8-9]$") == 1 | regexm(industry_code, "[0][D][3][0-9]$") == 1 
replace ind_agg = 7 if industry_code == "SIC_0E40" | industry_code == "SIC_0E46" | industry_code == "SIC_0E41" | industry_code == "SIC_0E42" | industry_code == "SIC_0E44" | industry_code == "SIC_0E45" | industry_code == "SIC_0E472" | industry_code == "SIC_0E473"  | industry_code == "SIC_0E478"
replace ind_agg = 3 if industry_code == "SIC_0E49" 
replace ind_agg = 8 if industry_code == "SIC_0E48" | industry_code == "SIC_0I78" | industry_code == "SIC_0D27" | industry_code == "SIC_0I7370" | industry_code == "SIC_0I7371" | industry_code == "SIC_0I7372" | industry_code == "SIC_0I7373"  | industry_code == "SIC_0I7374"  | industry_code == "SIC_0I7375"  | industry_code == "SIC_0I7376" | industry_code == "SIC_0I7379" | industry_code == "SIC_0D27" 
replace ind_agg = 6 if industry_code == "SIC_0F" | industry_code == "SIC_0G"
replace ind_agg = 9 if industry_code == "SIC_0H" | industry_code == "SIC_0I735" | industry_code == "SIC_0I7377" | industry_code == "SIC_0I751" | industry_code == "SIC_0I784" | industry_code == "SIC_0E474"
replace ind_agg = 13 if industry_code == "SIC_0I72" | industry_code == "SIC_0I86" | industry_code == "SIC_0I752" | industry_code == "SIC_0I753" | industry_code == "SIC_0I754" | industry_code == "SIC_0I76" | industry_code == "SIC_0I88" | industry_code == "SIC_0I89" | industry_code == "SIC_0I7378"
replace ind_agg = 12 if industry_code == "SIC_0I70" | industry_code == "SIC_0I79" | industry_code == "SIC_0I84" | industry_code == "SIC_0I781" | industry_code == "SIC_0I782" | industry_code == "SIC_0I783"   
replace ind_agg = 10 if industry_code == "SIC_0I81" | industry_code == "SIC_0I731" | industry_code == "SIC_0I732" | industry_code == "SIC_0I733" | industry_code == "SIC_0I734" | industry_code == "SIC_0I736" | industry_code == "SIC_0I738" 
replace ind_agg = 11 if industry_code == "SIC_0I80"  | industry_code == "SIC_0I82" | industry_code == "SIC_0I83" 
replace ind_agg = 14 if industry_code == "SIC_0J" | industry_code == "SIC_0E43"
replace ind_agg = 15 if industry_code == "SIC_0Z"
	*total 
replace ind_agg = . if industry_code == "SIC_0K"

drop if ind_agg == . 

#delimit ; 

label define industry
1 "Agriculture, forestry, fishing, and hunting"
2 "Mining" 
3 "Utilities"
4 "Construction" 
5 "Manufacturing" 
6 "Retail and Wholesale Trade"
7 "Transportation and warehousing"
8 "Information"
9 "Finance, Insurance, Real Estate, Rental, and Leasing"
10 "Professional and Business Services"
11 "Educational Services, Health Care, and Social Assistance"
12 "Arts, Entertainment, Recreation, Accommodation, and Food Services"
13 "Other services, except Government"
14 "Government"
15 "Total"
;

#delimit cr

label values ind_agg industry

*collapse by industries 
*mark if all the employment levels in a area-industry-time group are missing (if the first one is missing then all are since . sorted to last)
	*if one component of the industry is missing, then just treat as zero 
bys area_fips ind_agg date_quarter: g allmissing = mi(month1_emplvl[1])
collapse (sum) month1_emplvl month2_emplvl month3_emplvl (min) allmissing, by(area_fips ind_agg date_quarter)
*get the missings back -- collapse makes zero 
replace month1_emplvl= . if allmissing
replace month2_emplvl = . if allmissing
replace month3_emplvl = . if allmissing

*more cleaning
rename area_fips fipstate 

replace fipstate = "0" + fipstate

replace fipstate = substr(fipstate, -5, 2)

destring fipstate, replace

*crosswalk to PSID states
g state=fipstate if fipstate==1
replace state=50 if fipstate==2
replace state=fipstate-2 if fipstate<=6 & fipstate>2
replace state=fipstate-3 if fipstate<15 & fipstate>6
replace state=51 if fipstate==15
replace state=fipstate-5 if fipstate<43 & fipstate>15
replace state=fipstate-6 if fipstate>43 & fipstate<52
replace state=fipstate-7 if fipstate>52 & fipstate<=56

drop if state == 57
	*FBI undesignated location
drop if state == .
	
rename month1_emplvl month1_emplvl_sic
rename month2_emplvl month2_emplvl_sic
rename month3_emplvl month3_emplvl_sic
	
egen total_emp_sic = rowmean(month1_emplvl_sic month2_emplvl_sic month3_emplvl_sic)

save `data'/bartik_sic_st_cleaned.dta, replace

}

if merge_st {

use `data'/bartik_sic_st_cleaned.dta, clear

merge 1:1 state date_quarter ind_agg using `data'/bartik_naic_st_cleaned.dta
drop _merge

g year = yofd(dofq(date_quarter))
gen qtr = quarter(dofq(date_quarter))

*index so can xtset 
egen index =  group(state ind_agg)
xtset index date_quarter

*fill in missing time values
tsfill, full

xfill ind_agg state, i(index)

*replace the tsfills with zeroes 
replace total_emp_sic = 0 if total_emp_sic ==. 
replace total_emp_naic = 0 if total_emp_naic ==. 

*generate SIC growth rates
g sic_gr = (total_emp_sic - L.total_emp_sic) / L.total_emp_sic
replace sic_gr = 0 if sic_gr == . | sic_gr == -1

*generate NAIC growth rates
g naic_gr = (total_emp_naic - L.total_emp_naic) / L.total_emp_naic
replace naic_gr = 0 if naic_gr == . | naic_gr == -1

*make first value at junction a repetition of the base value. 
*Prevents crazy jumps if have, for example, NAIC reconstructed data in 2000 that doesn't match to 2001
	*ex: CBSA10 == 10940 & ind_agg = 14 
replace sic_gr = 0 if year == 2001 & qtr == 1
replace naic_gr = 0 if year == 2000 & qtr == 4 

////
*if first observation at junction is zero, need to rebase where we grow from to the first nonzero observation.
*otherwise will always be zero. 

*mark states/industries where the base employment is going to be zero 
levelsof state, local(st)
levelsof ind_agg, local(in)
g missing_naic = .
foreach s in `st' {
	foreach i in `in' {
		count if total_emp_naic == 0 & year == 2001 & qtr == 1 & state == `s' & ind_agg == `i'
		replace missing_naic = 1 if r(N) >= 1 & state == `s' & ind_agg == `i'
	}
}

*mark the first time SIC employment becomes not negative
gsort state ind_agg -date_quarter
by state ind_agg: g last_nonzero_sic = sum(total_emp_sic != 0) == 1 

*merge in ratio of actual to constructed at the national industry level -- rebase that first observation 
	*so can apply growth rates
merge m:1 ind_agg date_quarter using `data'/bartik_nat_merged, keepusing(date_quarter ind_agg ratio_naic)
drop if _merge == 1
	*total - not using for now
drop _merge 
///

*base at the NAIC 2001 level.
g naic_back = total_emp_naic if year == 2001 & qtr == 1 
*if that level is zero then multiple by national ratio actual/constructed
replace naic_back = total_emp_sic * ratio_naic if last_nonzero_sic == 1 & missing_naic == 1

*grow backwards according to SIC growth rates
count if naic_back == . & date_quarter < 164 & missing_naic == 1
while `r(N)' != 0 {
	bysort state ind_agg (date_quarter): replace naic_back = naic_back[_n+1]/(1+sic_gr[_n+1]) if date_quarter < 164 & missing_naic != 1
	count if naic_back == . & date_quarter < 164 & missing_naic != 1
}

*grow backwards IF needed to rebase. start from new base
levelsof state, local(st)
levelsof ind_agg, local(in)
foreach s in `st' {
	foreach i in `in' {
		su date_quarter if missing_naic ==1 & last_nonzero_sic == 1 & state == `s' & ind_agg == `i'
		if `r(N)' == 0 {
			continue
		}
		g x = `r(mean)'
		count if naic_back == . & date_quarter < x & missing_naic == 1 & state == `s' & ind_agg == `i'
		while `r(N)' != 0 {
			bysort state ind_agg (date_quarter): replace naic_back = naic_back[_n+1]/(1+sic_gr[_n+1]) if date_quarter < x & missing_naic == 1 & state == `s' & ind_agg == `i'
			count if naic_back == . & date_quarter < x & missing_naic == 1 & state == `s' & ind_agg == `i'
		}
		replace naic_back = 0 if date_quarter > x & missing_naic == 1 & state == `s' & ind_agg == `i'
		drop x
	}
}

*splice
g total_emp_back = naic_back if year <= 2000
replace total_emp_back = total_emp_naic if year > 2000
la var total_emp_back "NAICS Backwards"

////////////////
*alternative take SIC 2000q4 level and grow it forwards according to NAIC growth rates

////
*mark states where the first base is going to be zero 
levelsof state, local(st)
levelsof ind_agg, local(in)
g missing_sic = .
foreach s in `st' {
	foreach i in `in'{
		count if total_emp_sic== 0 & year == 2000 & qtr == 4 & state == `s' & ind_agg == `i'
		replace missing_sic = 1 if r(N) >= 1 & state == `s' & ind_agg == `i'
	}
}

*mark the first time SIC employment becomes not negative
gsort state ind_agg date_quarter
by state ind_agg: g first_nonzero_naic = sum(total_emp_naic != 0) == 1 

*merge in ratio of actual to constructed at the national industry level -- rebase that first observation 
	*so can apply growth rates
merge m:1 ind_agg date_quarter using `data'/bartik_nat_merged, keepusing(date_quarter ind_agg ratio_sic)
drop if _merge == 1
	*total - not using for now
drop _merge 
///

*base at the SIC 2000 level.
g sic_forward = total_emp_sic if year == 2000 & qtr == 4 
*if that level is zero then multiple by national ratio actual/constructed
replace sic_forward = total_emp_naic * ratio_sic if first_nonzero_naic == 1 & missing_sic == 1

*grow backwards according to SIC growth rates
count if sic_forward ==. & date_quarter > 163 
while `r(N)' != 0 {
	bysort state ind_agg (date_quarter): replace sic_forward = sic_forward[_n-1]*(1+naic_gr[_n]) if date_quarter > 163
	count if sic_forward == . & date_quarter > 163
}

*grow backwards IF needed to rebase. start from new base
levelsof state, local(st)
levelsof ind_agg, local(in)
foreach s in `st' {
	foreach i in `in' {
		su date_quarter if missing_sic ==1 & first_nonzero_naic == 1 & state == `s' & ind_agg == `i'
		if `r(N)' == 0 {
			continue
		}
		g x = `r(mean)'
		count if sic_forward == . & date_quarter > x & missing_sic == 1 & state == `s' & ind_agg == `i'
		while `r(N)' != 0 {
			bysort state ind_agg (date_quarter): replace naic_back = naic_back[_n+1]/(1+sic_gr[_n+1]) if date_quarter > x & missing_sic == 1 & state == `s' & ind_agg == `i'
			count if sic_forward == . & date_quarter > x & missing_sic == 1 & state == `s' & ind_agg == `i'
		}
		replace sic_forward = 0 if date_quarter > x & missing_sic == 1 & state == `s' & ind_agg == `i'
		drop x
	}
}

*splice
g total_emp_for = total_emp_sic if year <= 2000
replace total_emp_for = sic_forward if year > 2000
la var total_emp_for "SIC Forwards"

rename date_quarter time

keep time state total_emp* ind_agg

save `data'/bartik_st_merged.dta, replace

}

}

if msa {
if naics {
forval y=1999/2016 {
*2016: data folder has sub-industries with broader categories and file names different. Thus download procedure different so don't save a file for each subcategory 

	qui: fs "`data'/naic_county_files/`y'_qtrly_by_industry/`y'.q1-q4.by_industry/`y'.q1-q4*"
	
	if `y' != 2016 {
	
		*pulling data from csv format 
		foreach c in `r(files)' {
			if regexm("`c'", "[1][0] [T][o][t][a][l]") != 1 & regexm("`c'", "[U][n][c][l][a][s][s]") != 1 {
				import delimited using "`data'/naic_county_files/`y'_qtrly_by_industry/`y'.q1-q4.by_industry/`c'", colrange(1:18) varnames(1) clear stringcols(1 4 8 9)
				keep area_fips area_title industry_code year qtr month1_emplvl month2_emplvl month3_emplvl disclosure_code own_title agglvl_code qtrly_estabs_count own_title own_code industry_title
		
				drop if area_fips=="US000"
		
				*keeping the counties
				keep if agglvl_code=="74"
		
				*make sure industry-code is a string so everything will match
				tostring industry_code, replace
				
				*save
				if regexm("`c'", "[0-9] [R][e][t][a][i][l] [t][r][a][d][e]") == 1 | regexm("`c'", "[3][3] [M][a][n][u][f][a][c][t][u][r][i][n][g]") == 1 | regexm("`c'", "[4][9] [T][r]") == 1 {
					local filest = substr("`c'", 12, 5)
				}
				if regexm("`c'", "[4] [1-9][0-6] [A-Z]") == 1 {
					local filest = substr("`c'", 12, 2)
				}
		
				save "`data'/naic_county_files/`filest'_msa_`y'", replace
			}
		}
	}
	
	if `y' == 2016 {
		
		*pulling data from csv format 
		foreach c in `r(files)' {
				
			local c_mod = subinstr("`c'", " NAICS", "", 1)
		
			if regexm("`c_mod'", "[1][0] [T][o][t][a][l]") != 1 & regexm("`c'", "[U][n][c][l][a][s][s]") != 1 {
				import delimited using "`data'/naic_county_files/`y'_qtrly_by_industry/`y'.q1-q4.by_industry/`c'", colrange(1:18) varnames(1) clear stringcols(1 4 8 9)
				keep area_fips area_title industry_code year qtr month1_emplvl month2_emplvl month3_emplvl disclosure_code own_title agglvl_code qtrly_estabs_count own_title own_code industry_title
		
				drop if area_fips=="US000"
		
				*keeping the counties
				keep if agglvl_code=="74"
		
				*make sure industry-code is a string so everything will match
				tostring industry_code, replace
				
				*save
				if regexm("`c_mod'", "[0-9] [R][e][t][a][i][l] [t][r][a][d][e]") == 1 | regexm("`c'", "[3][3] [M][a][n][u][f][a][c][t][u][r][i][n][g]") == 1 | regexm("`c'", "[4][9] [T][r]") == 1 {
					local filest = substr("`c'", 12, 5)
					
					save "`data'/naic_county_files/`filest'_msa_`y'", replace
					
				}
				if regexm("`c_mod'", " [1-9][0-6] [A-Z]") == 1 {
					local filest = substr("`c'", 12, 2)
					
					save "`data'/naic_county_files/`filest'_msa_`y'", replace
					
				}
			}
		}
	}
}


forval y=1999/2016 {
*appending data
use "`data'/naic_county_files/11_msa_`y'", replace

foreach ind in 21 22 23 31-33 42 44-45 48-49 51 52 53 54 55 56 61 62 71 72 81 92 {
	append using "`data'/naic_county_files/`ind'_msa_`y'"
	}

tempfile naic_`y'
save "`naic_`y''"

}


use "`naic_1999'"

forval y=2000/2016 {

	append using "`naic_`y''"

}


g date_quarter = yq(year, qtr)
format date_quarter %tq

**nondisclosures: avg if beg and end, extend if just one, zero if none
bys area_fips industry_code own_code year qtr: g nondis = 1 if disclosure_code == ""

forval i = 1/3 {
	replace month`i'_emplvl = . if disclosure_code != ""
}

*linear interpolation if have real values on either side
sort area_fips industry_code own_code date_quarter
forval i = 1/3 {
	by area_fips industry_code own_code: ipolate month`i'_emplvl date_quarter, g(month`i')
	replace month`i'_emplvl = month`i' if month`i'_emplvl == . 
}

*extend values if have a real value on either side 
forval i = 1/3 {
	sort area_fips industry_code own_code date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
	gsort area_fips industry_code own_code -date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
}

*otherwise, nondisclosures are zeroes because no data we can use
forval i = 1/3 {
	replace month`i'_emplvl = 0 if month`i'_emplvl == . 
}

*aggregate by own_code
forval i = 1/3 {
	bysort area_fips industry_code year qtr: ereplace month`i'_emplvl = sum(month`i'_emplvl)
}
			
*Get rid of repetitive observations
collapse (max) month1_emplvl month2_emplvl month3_emplvl (first) area_title industry_title agglvl_code disclosure_code own_code own_title , by(area_fips industry_code date_quarter year qtr) 


*industries
g ind_agg = .
replace ind_agg = 1 if industry_code == "11" 
replace ind_agg = 2 if industry_code == "21"
replace ind_agg = 3 if industry_code == "22"
replace ind_agg = 4 if industry_code == "23"
replace ind_agg = 5 if industry_code == "31-33"
replace ind_agg = 6 if industry_code == "42" | industry_code == "44-45"
replace ind_agg = 7 if industry_code == "48-49"
replace ind_agg = 8 if industry_code == "51"
replace ind_agg = 9 if industry_code == "52" | industry_code == "53"
replace ind_agg = 10  if industry_code == "54" | industry_code == "55" | industry_code == "56"
replace ind_agg = 11 if industry_code == "61" | industry_code == "62"
replace ind_agg = 12 if industry_code == "71" | industry_code == "72"
replace ind_agg = 13 if industry_code == "81"
replace ind_agg = 14 if industry_code == "92"
*replace ind_agg = . if industry_code == "99"
drop if industry_code == "99"

#delimit ; 

label define industry
1 "Agriculture, forestry, fishing, and hunting"
2 "Mining" 
3 "Utilities"
4 "Construction" 
5 "Manufacturing" 
6 "Retail and Wholesale Trade"
7 "Transportation and warehousing"
8 "Information"
9 "Finance, Insurance, Real Estate, Rental, and Leasing"
10 "Professional and Business Services"
11 "Educational Services, Health Care, and Social Assistance"
12 "Arts, Entertainment, Recreation, Accomodation, and Food Services"
13 "Other services, except Government"
14 "Government"
;

#delimit cr

label values ind_agg industry

*collapse by industries 
*mark if all the employment levels in a area-industry-time group are missing (if the first one is missing then all are since . sorted to last)
	*if one component of the industry is missing, then just treat as zero 
bys area_fips ind_agg date_quarter: g allmissing = mi(month1_emplvl[1])
collapse (sum) month1_emplvl month2_emplvl month3_emplvl (min) allmissing, by(area_fips ind_agg date_quarter year qtr)
*get the missings back -- collapse makes zero 
replace month1_emplvl= . if allmissing
replace month2_emplvl = . if allmissing
replace month3_emplvl = . if allmissing

destring area_fips, gen(areafips)

tempfile naic_data
save "`naic_data'"


*matching with PSID-consistent (2013) CBSA codes

*Using the excel sheet of CBSA codes 2013
import excel `data'/census_cbsa_2013_codes.xlsx, cellrange(A1:K1883) firstrow case(lower) clear

drop metrodivisioncode csacode metropolitandivisiontitle csatitle 

rename metropolitanmicropolitanstatis  msa_type
rename countycountyequivalent county_name

g metro_dum = 0
replace metro_dum = 1 if msa_type == "Metropolitan Statistical Area"

g area_fips = fipsstatecode + fipscountycode

merge 1:m area_fips using "`naic_data'"

**Notes on non-merges:
/* 
* 1. the nonmerging observation from the excel file is Kalawao county. 
This is fine because elsewhere Kalawao county is combined with Maui county in the excel file, and that merges with the NAIC data.
So we don't need the Kalawao county code, since it's included elsewhere
* 2. After doing some spot checking, it looks like the non-merges (for _merge==2) are for counties not in an MSA
*/
keep if _merge==3
drop _merge

count if month1_emplvl==.

destring cbsacode, gen(CBSA10)

*19740 is Denver-Aurora-Lakewook in Colorado. The issue is that Broomfield County is a new county in 2002, and before parts were in the Boulder MSA. Not sure what to do about this one. (The totals for employment actually drop in 2002, which also doesn't make any sense.)
*Since we don't have PSID data in 2002, this shouldn't affect us too much. Need to check how many observations we lose from Denver MSA in 2003q1, since we would need the growth rate for that. Joe thinks if it's a lot, we could just not lag it in that case.
*We lose 4 in Denver, 0 in Boulder, and 0 in Greeley in 2003q1. We also care about Boulder and Greeley since the county was also in Boulder and Greeley. 
*Making Denver MSA, Boulder MSA and Greeley MSA missing in 2002 

**Denver
replace month1_emplvl=. if year==2002 & CBSA10== 19740
replace month2_emplvl=. if year==2002 & CBSA10== 19740
replace month3_emplvl=. if year==2002 & CBSA10== 19740

**Boulder
replace month1_emplvl=. if year==2002 & CBSA10== 14500
replace month2_emplvl=. if year==2002 & CBSA10== 14500
replace month3_emplvl=. if year==2002 & CBSA10== 14500

**Greeley MSA
replace month1_emplvl=. if year==2002 & CBSA10==24540
replace month2_emplvl=. if year==2002 & CBSA10==24540
replace month3_emplvl=. if year==2002 & CBSA10==24540


rename month1_emplvl month1_emplvl_naic
rename month2_emplvl month2_emplvl_naic
rename month3_emplvl month3_emplvl_naic

*Fixing problems/issues
*getting rid of observations not in the PSID, because I don't care about their issues
*cbsa_list_13 is a list of all CBSAs in the PSID (from confidential data) 
rename CBSA10 CBSA
merge m:1 CBSA using `data'/cbsa_list_13.dta, gen(in_psid) 
rename CBSA CBSA10
*the nonmerges from PSID are just 99998 and 99999

keep if in_psid==3
drop in_psid

save `data'/bartik_naic_msa_cleaned.dta, replace

}

if sic {
forval y=1975/2000 {

	qui: fs "`data'/sic_files/sic_`y'_qtrly_by_industry/sic.`y'.q1-q4.by_industry/sic.`y'.q1-q4*"
	
	*pulling the data from csv format
	foreach c in `r(files)' {
	
		if regexm("`c'", "[0][A-K] \(") == 1 | regexm("`c'", "[0][I][7-8][0-9] \(") == 1 | regexm("`c'", "[0][E][4][0-9] \(") == 1 | regexm("`c'", "[0][I][7-8][0-9][0-9] \(") == 1 | regexm("`c'", "[0][I][7-8][0-9][0-9][0-9] \(") == 1 | regexm("`c'", "[0][E][4][7][0-9] \(") == 1 | regexm("`c'", "[0][D][2-3][0-9] \(") == 1 {
			import delimited using "`data'/sic_files/sic_`y'_qtrly_by_industry/sic.`y'.q1-q4.by_industry/`c'", varnames(1) colrange(1:17) clear
			keep area_fips area_title year qtr month1_emplvl month2_emplvl month3_emplvl industry_code industry_title disclosure_code agglvl_code own_code own_title
	
			*keeping counties
			keep if agglvl_code== 28 | agglvl_code == 29 | agglvl_code == 30 | agglvl_code == 31 
	
			*make sure industry-code is a string so everything will match
			tostring industry_code, replace
			tostring disclosure_code, replace
	
			*save
			if regexm("`c'", "[0][A-K] \(") == 1 {
				local filest = substr("`c'", 16, 2)
			}
			if regexm("`c'", "[0][I][7-8][0-9] \(") == 1 | regexm("`c'", "[0][E][4][0-9] \(") == 1 | regexm("`c'", "[0][D][2-3][0-9] \(") == 1 {
				local filest = substr("`c'", 16, 4)
			}
			if regexm("`c'", "[0][I][7-8][0-9][0-9] \(") == 1 | regexm("`c'", "[0][E][4][7][0-9] \(") == 1 {
				local filest = substr("`c'", 16, 5)
			}
			if regexm("`c'", "[0][I][7-8][0-9][0-9][0-9] \(") == 1 {
				local filest = substr("`c'", 16, 6)
			}
			
			save "`data'/sic_files/`filest'_msa_`y'", replace
		}
	}
}

*append
forval y=1975/2000{

use "`data'/sic_files/0A_msa_`y'", clear

foreach letter in B C F G H J K D20 D21 D22 D23 D24 D25 D26 D27 D28 D29 D30 D31 D32 D33 D34 D35 D36 D37 D38 D39 I70 I72 I76 I79 I731 I732 I733 I734 I735 I736 I7370 I7371 I7372 I7373 I7374 I7375 I7376 I7377 I7378 I7379 I738 I751 I752 I754 I753 I754 I781 I782 I783 I784 I80 I81 I82 I83 I84 I86 I88 I89 E40 E41 E42 E43 E44 E45 E46 E472 E473 E474 E478 E48 E49 {
	capture: append using "`data'/sic_files/0`letter'_msa_`y'"
}

tempfile sic_`y'
save `sic_`y''

}

use "`sic_1975'"

forval y=1976/2000 {

	append using "`sic_`y''"

}


g date_quarter = yq(year, qtr)
format date_quarter %tq

**nondisclosures: avg if beg and end, extend if just one, zero if none
bys area_fips industry_code own_code year qtr: g nondis = 1 if disclosure_code == ""

forval i = 1/3 {
	replace month`i'_emplvl = . if disclosure_code != ""
}


*linear interpolation if have real values on either side
sort area_fips industry_code own_code date_quarter
forval i = 1/3 {
	by area_fips industry_code own_code: ipolate month`i'_emplvl date_quarter, g(month`i')
	replace month`i'_emplvl = month`i' if month`i'_emplvl == . 
}

*extend values if have a real value on either side 
forval i = 1/3 {
	sort area_fips industry_code own_code date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
	gsort area_fips industry_code own_code -date_quarter
	by area_fips industry_code own_code: replace month`i'_emplvl = month`i'_emplvl[_n-1] if month`i'_emplvl == . 
}

*otherwise, nondisclosures are zeroes because no data we can use
forval i = 1/3 {
	replace month`i'_emplvl = 0 if month`i'_emplvl == . 
}

*aggregate by own_code
forval i = 1/3 {
	bysort area_fips industry_code year qtr: ereplace month`i'_emplvl = sum(month`i'_emplvl)
}
			
*Get rid of repetitive observations
collapse (max) month1_emplvl month2_emplvl month3_emplvl (first) area_title industry_title agglvl_code disclosure_code own_code own_title , by(area_fips industry_code date_quarter year qtr) 

*industries
g ind_agg = .
replace ind_agg = 1 if industry_code == "SIC_0A" 
replace ind_agg = 2 if industry_code == "SIC_0B"
replace ind_agg = 4 if industry_code == "SIC_0C"
replace ind_agg = 5 if industry_code == "SIC_0D20" | regexm(industry_code, "[0][D][2][1-6]$") == 1 | regexm(industry_code, "[0][D][2][8-9]$") == 1 | regexm(industry_code, "[0][D][3][0-9]$") == 1 
replace ind_agg = 7 if industry_code == "SIC_0E40" | industry_code == "SIC_0E46" | industry_code == "SIC_0E41" | industry_code == "SIC_0E42" | industry_code == "SIC_0E44" | industry_code == "SIC_0E45" | industry_code == "SIC_0E472" | industry_code == "SIC_0E473"  | industry_code == "SIC_0E478"
replace ind_agg = 3 if industry_code == "SIC_0E49" 
replace ind_agg = 8 if industry_code == "SIC_0E48" | industry_code == "SIC_0I78" | industry_code == "SIC_0D27" | industry_code == "SIC_0I7370" | industry_code == "SIC_0I7371" | industry_code == "SIC_0I7372" | industry_code == "SIC_0I7373"  | industry_code == "SIC_0I7374"  | industry_code == "SIC_0I7375"  | industry_code == "SIC_0I7376" | industry_code == "SIC_0I7379" | industry_code == "SIC_0D27" 
replace ind_agg = 6 if industry_code == "SIC_0F" | industry_code == "SIC_0G"
replace ind_agg = 9 if industry_code == "SIC_0H" | industry_code == "SIC_0I735" | industry_code == "SIC_0I7377" | industry_code == "SIC_0I751" | industry_code == "SIC_0I784" | industry_code == "SIC_0E474"
replace ind_agg = 13 if industry_code == "SIC_0I72" | industry_code == "SIC_0I86" | industry_code == "SIC_0I752" | industry_code == "SIC_0I753" | industry_code == "SIC_0I754" | industry_code == "SIC_0I76" | industry_code == "SIC_0I88" | industry_code == "SIC_0I89" | industry_code == "SIC_0I7378"
replace ind_agg = 12 if industry_code == "SIC_0I70" | industry_code == "SIC_0I79" | industry_code == "SIC_0I84" | industry_code == "SIC_0I781" | industry_code == "SIC_0I782" | industry_code == "SIC_0I783"   
replace ind_agg = 10 if industry_code == "SIC_0I81" | industry_code == "SIC_0I731" | industry_code == "SIC_0I732" | industry_code == "SIC_0I733" | industry_code == "SIC_0I734" | industry_code == "SIC_0I736" | industry_code == "SIC_0I738" 
replace ind_agg = 11 if industry_code == "SIC_0I80"  | industry_code == "SIC_0I82" | industry_code == "SIC_0I83" 
replace ind_agg = 14 if industry_code == "SIC_0J" | industry_code == "SIC_0E43"
replace ind_agg = 15 if industry_code == "SIC_0Z"
	*total 
replace ind_agg = . if industry_code == "SIC_0K"

drop if ind_agg == .  

#delimit ; 

label define industry
1 "Agriculture, forestry, fishing, and hunting"
2 "Mining" 
3 "Utilities"
4 "Construction" 
5 "Manufacturing" 
6 "Retail and Wholesale Trade"
7 "Transportation and warehousing"
8 "Information"
9 "Finance, Insurance, Real Estate, Rental, and Leasing"
10 "Professional and Business Services"
11 "Educational Services, Health Care, and Social Assistance"
12 "Arts, Entertainment, Recreation, Accomodation, and Food Services"
13 "Other services, except Government"
14 "Government"
;

#delimit cr

label values ind_agg industry

*collapse by industries 
*mark if all the employment levels in a area-industry-time group are missing (if the first one is missing then all are since . sorted to last)
	*if one component of the industry is missing, then just treat as zero 
bys area_fips ind_agg date_quarter: g allmissing = mi(month1_emplvl[1])
collapse (sum) month1_emplvl month2_emplvl month3_emplvl (min) allmissing, by(area_fips ind_agg date_quarter year qtr)
*get the missings back -- collapse makes zero 
replace month1_emplvl= . if allmissing
replace month2_emplvl = . if allmissing
replace month3_emplvl = . if allmissing


destring(area_fips), gen(areafips)

keep areafips month* year qtr ind_agg date_quarter

rename month1_emplvl month1_emplvl_sic
rename month2_emplvl month2_emplvl_sic
rename month3_emplvl month3_emplvl_sic

save `data'/bartik_sic_msa_pre.dta, replace

*the DC county values are not disclused in 1990 and 1998 for some reason. 
*Use state data to replace those values with state values -- the county and the state values are otherwise the same. 
use "`data'/bartik_sic_st_cleaned.dta", clear
keep if state == 8
format date_quarter %tq
rename month1_emplvl month1_emplvl_sic_st
rename month2_emplvl month2_emplvl_sic_st
rename month3_emplvl month3_emplvl_sic_st
tempfile sic_st_dc
save `sic_st_dc', replace

use "`data'/bartik_sic_msa_pre.dta", clear
keep if areafips == 11000 | areafips == 11001
merge 1:m date_quarter ind_agg using `sic_st_dc'

foreach var in month1_emplvl_sic month2_emplvl_sic month3_emplvl_sic {
	
	su `var'_st if year==1989 & qtr==1 & areafips==11000
	replace `var'=r(mean) if year==1989 & qtr==1 & areafips==11001
	
	su `var'_st if year==1989 & qtr==2 & areafips==11000
	replace `var'=r(mean) if year==1989 & qtr==2 & areafips==11001
	
	su `var'_st if year==1989 & qtr==3 & areafips==11000
	replace `var'=r(mean) if year==1989 & qtr==3 & areafips==11001
	
	su `var'_st if year==1989 & qtr==4 & areafips==11000
	replace `var'=r(mean) if year==1989 & qtr==4 & areafips==11001
	
	su `var'_st if year==1990 & qtr==1 & areafips==11000
	replace `var'=r(mean) if year==1990 & qtr==1 & areafips==11001

	su `var'_st if year==1990 & qtr==2 & areafips==11000
	replace `var'=r(mean) if year==1990 & qtr==2 & areafips==11001
	
}

*Merge back in rest of SIC data
keep areafips year qtr date_quarter month1_emplvl_sic month2_emplvl_sic month3_emplvl_sic fipstate state

tempfile sic_st_dc_fix
save `sic_st_dc_fix', replace

merge m:m date_quarter areafips using "`data'/bartik_sic_msa_pre.dta"
drop _merge

*Dade County changed to Miami-Dade County in 1990q1 (in QCEW data) (33100)
replace areafips = 12086 if areafips == 12025

tempfile sic_data
save "`sic_data'", replace

*matching with CBSA codes
*Using the excel sheet with CBSA codes 2013
import excel `data'/census_cbsa_2013_codes.xlsx, cellrange(A1:K1883) firstrow case(lower) clear

drop metrodivisioncode csacode metropolitandivisiontitle csatitle 

rename metropolitanmicropolitanstatis  msa_type
rename countycountyequivalent county_name

g metro_dum = 0
replace metro_dum = 1 if msa_type == "Metropolitan Statistical Area"

g areafips = fipsstatecode + fipscountycode
destring areafips, replace

merge 1:m areafips using "`sic_data'"

**Notes on non-merges:
/*
* 1. the nonmerging observation from the excel file is Kalawao county. 
This is fine because elsewhere Kalawao county is combined with Maui county in the excel file, and that merges with the NAIC data.
So we don't need the Kalawao county code, since it's included elsewhere.
The other _merge == 1 is from Broomfield county, which doesn't exist in the SIC time frame so this is fine to throw out. 
* 2. After doing some spot checking, it looks like the non-merges (for _merge==2) are for counties not in an MSA
*/

keep if _merge==3
drop _merge
count if month1_emplvl==.

tab cbsacode if month1_emplvl==. & year>1982

destring cbsacode, gen(CBSA10)


/*Check if county makeup of MSAs changes over time*/
bysort CBSA10 year fipscountycode: g nvals = _n == 1
by CBSA10 year: replace nvals = sum(nvals)
by CBSA10 year: replace nvals = nvals[_N]

*br nvals fipscountycode CBSA10 state year

bysort CBSA10: gen diff = 1 if nvals[1] != nvals[_N]
tab CBSA10 if diff == 1
*17980: Muscogee county, Georgia shows up in 1982. Before PSID so don't care. 
*47260: County added in 1976 -- before PSID data so don't care 
*47900: 2 counties added in 1976 -- before PSID data so don't care 

*Fixing problems/issues

*getting rid of observations not in the PSID, because I don't care about their issues
rename CBSA10 CBSA
merge m:1 CBSA using `data'/cbsa_list_13.dta, gen(in_psid) 
rename CBSA CBSA10
*the nonmerges from PSID are just 99998 and 99999

keep if in_psid==3
drop in_psid

keep if year>1982

save "`data'/bartik_sic_msa_cleaned.dta", replace

}

if merge_msa {
*Calculate NAIC and SIC growth rates seperately then splice

***NAICS data**
use `data'/bartik_naic_msa_cleaned.dta, replace
drop if year <= 1982

egen total_emp_naic = rowmean(month1_emplvl_naic month2_emplvl_naic month3_emplvl_naic)

destring fipsstatecode, replace
collapse (sum) total_emp_naic (mean) metro_dum fipsstatecode, by(CBSA10 ind_agg year qtr)
*This creates quarterly MSA level data by summing months and then summing counties 

g date_quarter = yq(year, qtr)
format date_quarter %tq

save `data'/bartik_naic_msa_temp, replace

***SIC data***
use "`data'/bartik_sic_msa_cleaned.dta", clear
drop if year <= 1982 

destring fipscountycode, g(county_code)

*Collapse to get quarterly MSA data
egen total_emp_sic = rowmean(month1_emplvl_sic month2_emplvl_sic month3_emplvl_sic)
	*STOCK variable

*Collapse
destring fipsstatecode, replace
collapse (sum) total_emp_sic (mean) metro_dum fipsstatecode, by(CBSA10 ind_agg year qtr)

g date_quarter = yq(year, qtr)
format date_quarter %tq

save "`data'/bartik_sic_msa_temp.dta", replace

***Merge collapsed NAIC and SIC datasets****
merge 1:1 CBSA10 ind_agg date_quarter using `data'/bartik_naic_msa_temp.dta
drop _merge

*index so can xtset 
egen index =  group(CBSA10 ind_agg)

xtset index date_quarter

*if any MSAs are missing industry data for any time periods, fill in with zeroes. 
tsfill, full

*fill in industry and MSA names 
xfill ind_agg CBSA10, i(index)

drop year
drop qtr
g year = yofd(dofq(date_quarter))
gen qtr = quarter(dofq(date_quarter))

*replace tsfills with zero
replace total_emp_sic = 0 if total_emp_sic ==. 
replace total_emp_naic = 0 if total_emp_naic ==. 

*generate SIC growth rates
g sic_gr = (total_emp_sic - L.total_emp_sic) / L.total_emp_sic
replace sic_gr = 0 if sic_gr == . | sic_gr == -1

*generate NAIC growth rates
g naic_gr = (total_emp_naic - L.total_emp_naic) / L.total_emp_naic
replace naic_gr = 0 if naic_gr == . | naic_gr == -1

*this means the first value will be a repetition of the base value. 
*Prevents crazy jumps if have, for example, NAIC reconstructed data in 2000 that doesn't match to 2001
	*ex: CBSA10 == 10940 & ind_agg = 14 
replace sic_gr = 0 if year == 2001 & qtr == 1
replace naic_gr = 0 if year == 2001 & qtr == 1

////////////
*mark MSAs where the first base is going to be zero 
qui: levelsof CBSA10, local(cbsa)
g missing_naic = .
foreach c in `cbsa' {
	forval i = 1/14 {
		count if total_emp_naic == 0 & year == 2001 & qtr == 1 & CBSA10 == `c' & ind_agg == `i'
		replace missing_naic = 1 if `r(N)' >= 1 & CBSA10 == `c' & ind_agg == `i'
	}
}

*merge in ratio of actual to constructed at the national industry level -- rebase that first observation 
	*so can apply growth rates
merge m:1 ind_agg date_quarter using `data'/bartik_nat_merged, keepusing(date_quarter ind_agg ratio_naic)
keep if _merge == 3
	* nonmatching years (dropped before 1983)
drop _merge 
///

*mark the first time SIC employment becomes not negative
gsort CBSA10 ind_agg -date_quarter
by CBSA10 ind_agg: g last_nonzero_sic = sum(total_emp_sic != 0) == 1 & ratio_naic != . 


*base at the NAIC 2001 level.
g naic_back = total_emp_naic if year == 2001 & qtr == 1 
*if that level is zero then multiple by national ratio actual/constructed
replace naic_back = total_emp_sic * ratio_naic if last_nonzero_sic == 1 & missing_naic == 1

*grow backwards according to SIC growth rates
count if naic_back == . & date_quarter < 164 & missing_naic != 1
while `r(N)' != 0 {
	bysort CBSA10 ind_agg (date_quarter): replace naic_back = naic_back[_n+1]/(1+sic_gr[_n+1]) if date_quarter < 164 & missing_naic != 1
	count if naic_back == . & date_quarter < 164 & missing_naic != 1
}

*grow backwards IF needed to rebase. start from new base
levelsof CBSA10, local(cbsa)
foreach c in `cbsa' {
	forval i = 1/14 {
		di "`c'"
		di "`i'"
		su date_quarter if missing_naic ==1 & last_nonzero_sic == 1 & CBSA10 == `c' & ind_agg == `i'
		if `r(N)' == 0 {
			continue
		}
		g x = `r(mean)'
		count if naic_back == . & date_quarter < x & missing_naic == 1 & CBSA10 == `c' & ind_agg == `i'
		while `r(N)' != 0 {
			bysort CBSA10 ind_agg (date_quarter): replace naic_back = naic_back[_n+1]/(1+sic_gr[_n+1]) if date_quarter < x & missing_naic == 1 & CBSA10 == `c' & ind_agg == `i'
			count if naic_back == . & date_quarter < x & missing_naic == 1 & CBSA10 == `c' & ind_agg == `i'
		}
		replace naic_back = 0 if date_quarter > x & missing_naic == 1 & CBSA10 == `c' & ind_agg == `i'
		drop x
	}
}

*splice
g total_emp_back = naic_back if year <= 2000
replace total_emp_back = total_emp_naic if year > 2000
la var total_emp_back "NAICS Backwards"

save `data'/bartik_msa_midsplice, replace


use `data'/bartik_msa_midsplice, clear

////////////////
*alternative take SIC 2000q4 level and grow it forwards according to NAIC growth rates

////
*mark MSAs where the first base is going to be zero 
levelsof CBSA10, local(cbsa)
g missing_sic = .
foreach c in `cbsa' {
	forval i = 1/14 {
		count if total_emp_sic== 0 & year == 2000 & qtr == 4 & CBSA10 == `c' & ind_agg == `i'
		replace missing_sic = 1 if r(N) >= 1 & CBSA10 == `c' & ind_agg == `i'
	}
}

*merge in ratio of actual to constructed at the national industry level -- rebase that first observation 
	*so can apply growth rates
merge m:1 ind_agg date_quarter using `data'/bartik_nat_merged, keepusing(date_quarter ind_agg ratio_sic)
drop if _merge == 1
	*total - not using for now
drop _merge 
///

*mark the first time NAIC employment becomes not negative (and there is a SIC ratio to multiply by -- sometimes you get NAIC levels in 1999q1)
gsort CBSA10 ind_agg date_quarter
by CBSA10 ind_agg: g first_nonzero_naic = sum(total_emp_naic != 0) == 1 if ratio_sic != . 

*base at the SIC 2000 level.
g sic_forward = total_emp_sic if year == 2000 & qtr == 4 
*if that level is zero then multiple by national ratio actual/constructed
replace sic_forward = total_emp_naic * ratio_sic if first_nonzero_naic == 1 & missing_sic == 1

*grow forwards according to SIC growth rates
count if sic_forward ==. & date_quarter > 163 
while `r(N)' != 0 {
	bysort CBSA10 ind_agg (date_quarter): replace sic_forward = sic_forward[_n-1]*(1+naic_gr[_n]) if date_quarter > 163  & missing_sic != 1
	count if sic_forward == . & date_quarter > 163  & missing_sic != 1
}

*grow forwards IF needed to rebase. start from new base
levelsof CBSA10, local(cbsa)
foreach c in `cbsa' {
	forval i = 1/14 {
		di "`c'"
		di "`i'"
		su date_quarter if missing_sic ==1 & first_nonzero_naic == 1 & CBSA10 == `c' & ind_agg == `i'
		if `r(N)' == 0 {
			continue
		}
		g x = `r(mean)'
		count if sic_forward == . & date_quarter > x & missing_sic == 1 & CBSA10 == `c' & ind_agg == `i'
		while `r(N)' != 0 {
			bysort CBSA10 ind_agg (date_quarter): replace sic_forward = sic_forward[_n-1]*(1+naic_gr[_n]) if date_quarter > x & missing_sic == 1 & CBSA10 == `c' & ind_agg == `i'
			count if sic_forward == . & date_quarter > x & missing_sic == 1 & CBSA10 == `c' & ind_agg == `i'
		}
		replace sic_forward = 0 if date_quarter < x & missing_sic == 1 & CBSA10 == `c' & ind_agg == `i'
		drop x
	}
}

*splice
g total_emp_for = total_emp_sic if year <= 2000
replace total_emp_for = sic_forward if year > 2000
la var total_emp_for "SIC Forwards"
rename date_quarter time


keep total_emp* time CBSA10 ind_agg 

drop if CBSA10 == . 

save `data'/bartik_msa_merged.dta, replace

}
}


*************************************
*calculate bartik employment growth 
**************************************

* Note there is a "forward" and "backward" Bartik measure based on how SIC and NAICS is spliced together
* whether SIC employment levels are growth out forward or bartik levels are extendedd backwards 
* annual and quarterly (weighted and unweighted) measures are also created. Weights are controlled by flag
* at the beginning of the program.


if weights {
set graphics off

if state_weights {
use `data'/bartik_st_merged, clear

g year = yofd(dofq(time))
gen qtr = quarter(dofq(time))

**************Forward measure******************
*aggregate by industry to get full employment measure in the state
g sum_emp = .
bys state time: ereplace sum_emp = sum(total_emp_for)

*generate share of industry employment of all employment in state 
g share = total_emp_for / sum_emp

*employment shares should sum to 1 within a state and time 
bys state time: egen check = sum(share)
su check

tempfile state
save `state'

/////
*create national industry employment measure
use `data'/bartik_nat_merged, clear

rename date_quarter time

rename total_emp_naic us_emp_naic
rename total_emp_sic us_emp_sic

rename total_emp_for us_emp_for
rename total_emp_back us_emp_back

keep time ind_agg us_emp_naic us_emp_sic us_emp_for us_emp_back

merge 1:m ind_agg time using `state'
drop _merge

*create 'restricted' national industry employment for a state that excludes employment in that state
bys state ind_agg time: g us_emp_restr_naic = us_emp_naic - total_emp_naic
bys state ind_agg time: g us_emp_restr_sic = us_emp_sic - total_emp_sic

*reconstructed version for growth splice 
bys state ind_agg time: g total_emp_for_restr = us_emp_for - total_emp_for

egen index =  group(state ind_agg)
xtset index time

*national industry employment growth rate
*splice! Use reconstructed NAICS in 2000 to create 2000-2001 growth rate
	*our reconstructed based on SIC growth rates 
g us_emp_gr_naic = 100*((us_emp_naic/L4.us_emp_naic)-1)
g us_emp_gr_sic = 100*((us_emp_sic/L4.us_emp_sic)-1)

g us_emp_gr = us_emp_gr_sic if year <= 2000
replace us_emp_gr = us_emp_gr_naic if year > 2001

forval q = 1/4 {
	replace us_emp_gr = 100*((us_emp_naic/L4.us_emp_for)-1) if year == 2001 & qtr == `q'
}

g us_emp_restr_gr_naic = 100*((us_emp_restr_naic/L4.us_emp_restr_naic)-1)
g us_emp_restr_gr_sic = 100*((us_emp_restr_sic/L4.us_emp_restr_sic)-1)

g us_emp_restr_gr = us_emp_restr_gr_sic if year <= 2000
replace us_emp_restr_gr = us_emp_restr_gr_naic if year > 2001

forval q = 1/4 {
	replace us_emp_restr_gr = ((us_emp_restr_naic/L4.total_emp_for_restr)-1)*100 if year == 2001 & qtr == `q'
}

/////
*weight national industry growth rates by lagged share of employment of industry in each state
g us_emp_gr_w = L.share*us_emp_gr
g us_emp_restr_gr_w = L.share*us_emp_restr_gr

*sum up all of the growth rates to get the overall state growth rate
bys state time: egen bartik_empgr_for_st = sum(us_emp_gr_w)
bys state time: egen bartik_empgr_restr_for_st = sum(us_emp_restr_gr_w)

replace bartik_empgr_for_st = . if time < 64
replace bartik_empgr_restr_for_st = . if time < 64
	*first time period, becomes zero but actually missing

la var bartik_empgr_for_st "Bartik Empl. Growth"
la var bartik_empgr_restr_for_st "Bartik Empl. Growth (restricted)"

*lag 
xtset index time 
g l_bartik_empgr_for_st = L.bartik_empgr_for_st
replace bartik_empgr_for_st = l_bartik_empgr_for_st
g l_bartik_empgr_restr_for_st = L.bartik_empgr_restr_for_st
replace bartik_empgr_restr_for_st = l_bartik_empgr_restr_for_st

*collapse to state-time level dataset
collapse (max) bartik_empgr_for_st bartik_empgr_restr_for_st, by(state year qtr time)

save `data'/bartik_st_weighted_for, replace


///////////////////////////////////////////////////////////////////////////////
use `data'/bartik_st_merged, clear

g year = yofd(dofq(time))
gen qtr = quarter(dofq(time))

**************Backwards measure******************
*aggregate by industry to get full employment measure in the state
g sum_emp = .
bys state time: ereplace sum_emp = sum(total_emp_back)

*generate share of industry employment of all employment in state 
g share = total_emp_back / sum_emp

*employment shares should sum to 1 within a state and time 
bys state time: egen check = sum(share)

tempfile state
save `state'

/////
*create national industry employment measure
use `data'/bartik_nat_merged, clear

rename date_quarter time

rename total_emp_naic us_emp_naic
rename total_emp_sic us_emp_sic

rename total_emp_for us_emp_for
rename total_emp_back us_emp_back

keep time ind_agg us_emp_naic us_emp_sic us_emp_for us_emp_back

merge 1:m ind_agg time using `state'
drop _merge

*create national industry employment for a state that excludes employment in that state
bys state ind_agg time: g us_emp_restr_naic = us_emp_naic - total_emp_naic
bys state ind_agg time: g us_emp_restr_sic = us_emp_sic - total_emp_sic

*reconstructed version for growth splice 
bys state ind_agg time: g total_emp_back_restr = us_emp_back - total_emp_back

egen index =  group(state ind_agg)
xtset index time

*national industry employment growth rate
*splice! Use reconstructed NAICS in 2000 to create 2000-2001 growth rate
	*our reconstructed based on SIC growth rates 
g us_emp_gr_naic = 100*((us_emp_naic/L4.us_emp_naic)-1)
g us_emp_gr_sic = 100*((us_emp_sic/L4.us_emp_sic)-1)

g us_emp_gr = us_emp_gr_sic if year <= 2000
replace us_emp_gr = us_emp_gr_naic if year > 2001

forval q = 1/4 {
	replace us_emp_gr = 100*((us_emp_naic/L4.us_emp_back)-1) if year == 2001 & qtr == `q'
}

g us_emp_restr_gr_naic = 100*((us_emp_restr_naic/L4.us_emp_restr_naic)-1)
g us_emp_restr_gr_sic = 100*((us_emp_restr_sic/L4.us_emp_restr_sic)-1)

g us_emp_restr_gr = us_emp_restr_gr_sic if year <= 2000
replace us_emp_restr_gr = us_emp_restr_gr_naic if year > 2001

forval q = 1/4 {
	replace us_emp_restr_gr = ((us_emp_restr_naic/L4.total_emp_back_restr)-1)*100 if year == 2001 & qtr == `q'
}

/////
*weight national industry growth rates by lagged share of employment of industry in each state
g us_emp_gr_w = L.share*us_emp_gr
g us_emp_restr_gr_w = L.share*us_emp_restr_gr

*sum up all of the growth rates to get the overall state growth rate
bys state time: egen bartik_empgr_back_st = sum(us_emp_gr_w)
bys state time: egen bartik_empgr_restr_back_st = sum(us_emp_restr_gr_w)

replace bartik_empgr_back_st = . if time < 64
replace bartik_empgr_restr_back_st = . if time < 64
	*first time period, becomes zero but actually missing

la var bartik_empgr_back_st "Bartik Empl. Growth"
la var bartik_empgr_restr_back_st "Bartik Empl. Growth (restricted)"

*lag 
xtset index time 
g l_bartik_empgr_back_st = L.bartik_empgr_back_st
replace bartik_empgr_back_st = l_bartik_empgr_back_st
g l_bartik_empgr_restr_back_st = L.bartik_empgr_restr_back_st
replace bartik_empgr_restr_back_st = l_bartik_empgr_restr_back_st

*collapse to state-time level dataset
collapse (max) bartik_empgr_back_st bartik_empgr_restr_back_st, by(state year qtr time)

save `data'/bartik_st_weighted_back, replace
}

if msa_weights {
***************QUARTERLY**********************
**************Forward measure******************

use `data'/bartik_msa_merged, clear

g year = yofd(dofq(time))
g qtr = quarter(dofq(time))

*aggregate by industry to get full employment measure in the state
g sum_emp = .
bys CBSA10 time: ereplace sum_emp = sum(total_emp_for)

*generate share of industry employment of all employment in state 
g share = total_emp_for / sum_emp

*employment shares should sum to 1 within a state and time 
bys CBSA10 time: egen check = sum(share)
*one zero case: because all industries are zero in those years (before the BH sample begins)

tempfile CBSA10
save `CBSA10'

/////
*create national industry employment measure
use `data'/bartik_nat_merged, clear

rename date_quarter time

rename total_emp_naic us_emp_naic
rename total_emp_sic us_emp_sic
rename total_emp_for us_emp_for
rename total_emp_back us_emp_back

keep time ind_agg us_emp_naic us_emp_sic us_emp_for us_emp_back

merge 1:m ind_agg time using `CBSA10'
keep if _merge == 3 /*time issue*/
drop _merge

*create national industry employment for a MSA that excludes employment in that MSA
bys CBSA10 ind_agg time: g us_emp_restr_naic = us_emp_naic - total_emp_naic
bys CBSA10 ind_agg time: g us_emp_restr_sic = us_emp_sic - total_emp_sic

*reconstructed version for growth splice 
bys CBSA10 ind_agg time: g total_emp_for_restr = us_emp_for - total_emp_for

egen index =  group(CBSA10 ind_agg)
xtset index time

*national industry employment growth rate
*splice! Use reconstructed NAICS in 2000 to create 2000-2001 growth rate
	*our reconstructed based on SIC growth rates 
g us_emp_gr_naic = 100*((us_emp_naic/L4.us_emp_naic)-1)
g us_emp_gr_sic = 100*((us_emp_sic/L4.us_emp_sic)-1)

g us_emp_gr = us_emp_gr_sic if year <= 2000
replace us_emp_gr = us_emp_gr_naic if year > 2001

forval q = 1/4 {
	replace us_emp_gr = 100*((us_emp_naic/L4.us_emp_for)-1) if year == 2001 & qtr == `q'
}

g us_emp_restr_gr_naic = 100*((us_emp_restr_naic/L4.us_emp_restr_naic)-1)
g us_emp_restr_gr_sic = 100*((us_emp_restr_sic/L4.us_emp_restr_sic)-1)

g us_emp_restr_gr = us_emp_restr_gr_sic if year <= 2000
replace us_emp_restr_gr = us_emp_restr_gr_naic if year > 2001

forval q = 1/4 {
	replace us_emp_restr_gr = ((us_emp_restr_naic/L4.total_emp_for_restr)-1)*100 if year == 2001 & qtr == `q'
}

/////
*weight national industry growth rates by lagged share of employment of industry in each MSA
g us_emp_gr_w = L.share*us_emp_gr
g us_emp_restr_gr_w = L.share*us_emp_restr_gr

*sum up all of the growth rates to get the overall MSA growth rate
bys CBSA10 time: egen bartik_empgr_for_msa = sum(us_emp_gr_w)
bys CBSA10 time: egen bartik_empgr_restr_for_msa = sum(us_emp_restr_gr_w)

replace bartik_empgr_for_msa = . if time < 97
replace bartik_empgr_restr_for_msa = . if time < 97
	*first time period, becomes zero but actually missing

la var bartik_empgr_for_msa "Bartik Empl. Growth"
la var bartik_empgr_restr_for_msa "Bartik Empl. Growth (restricted)"

*lag 
xtset index time 
g l_bartik_empgr_for_msa = L.bartik_empgr_for_msa
replace bartik_empgr_for_msa = l_bartik_empgr_for_msa
g l_bartik_empgr_restr_for_msa = L.bartik_empgr_restr_for_msa
replace bartik_empgr_restr_for_msa = l_bartik_empgr_restr_for_msa

*collapse to MSA-time level dataset
collapse (max) bartik_empgr_for_msa bartik_empgr_restr_for_msa, by(CBSA10 year qtr time)

save `data'/bartik_msa_weighted_for, replace




***************ANNUAL**************************
**************Forward measure******************

use `data'/bartik_msa_merged, clear

g year = yofd(dofq(time))
g qtr = quarter(dofq(time))

*make annual
collapse (mean) total_emp_for total_emp_naic total_emp_sic, by(year CBSA10 ind_agg)

*aggregate by industry to get full employment measure in the msa
g sum_emp = .
bys CBSA10 year: ereplace sum_emp = sum(total_emp_for)

*generate share of industry employment of all employment in msa 
g share = total_emp_for / sum_emp

*employment shares should sum to 1 within a msa and time 
bys CBSA10 year: egen check = sum(share)

tempfile CBSA10
save `CBSA10'

/////
*create national industry employment measure
use `data'/bartik_nat_merged, clear

rename date_quarter time
g year = yofd(dofq(time))

rename total_emp_naic us_emp_naic
rename total_emp_sic us_emp_sic
rename total_emp_for us_emp_for
rename total_emp_back us_emp_back

keep year ind_agg us_emp_naic us_emp_sic us_emp_for us_emp_back

*make annual
collapse (mean) us_emp_naic us_emp_sic us_emp_for us_emp_back , by(year ind_agg)

merge 1:m ind_agg year using `CBSA10'
keep if _merge == 3 /*time issue*/
drop _merge

*create national industry employment for a MSA that excludes employment in that MSA
bys CBSA10 ind_agg year: g us_emp_restr_naic = us_emp_naic - total_emp_naic
bys CBSA10 ind_agg year: g us_emp_restr_sic = us_emp_sic - total_emp_sic

*reconstructed version for growth splice 
bys CBSA10 ind_agg year: g total_emp_for_restr = us_emp_for - total_emp_for

egen index =  group(CBSA10 ind_agg)
xtset index year

*national industry employment growth rate
*splice! Use reconstructed NAICS in 2000 to create 2000-2001 growth rate
	*our reconstructed based on SIC growth rates 
g us_emp_gr_naic = 100*((us_emp_naic/L.us_emp_naic)-1)
g us_emp_gr_sic = 100*((us_emp_sic/L.us_emp_sic)-1)

g us_emp_gr = us_emp_gr_sic if year <= 2000
replace us_emp_gr = us_emp_gr_naic if year > 2001

replace us_emp_gr = 100*((us_emp_naic/L.us_emp_for)-1) if year == 2001 

g us_emp_restr_gr_naic = 100*((us_emp_restr_naic/L.us_emp_restr_naic)-1)
g us_emp_restr_gr_sic = 100*((us_emp_restr_sic/L.us_emp_restr_sic)-1)

g us_emp_restr_gr = us_emp_restr_gr_sic if year <= 2000
replace us_emp_restr_gr = us_emp_restr_gr_naic if year > 2001

replace us_emp_restr_gr = 100*((us_emp_restr_naic/L.total_emp_for_restr)-1) if year == 2001 

/////
*weight national industry growth rates by lagged share of employment of industry in each MSA
g us_emp_gr_w = L.share*us_emp_gr
g us_emp_restr_gr_w = L.share*us_emp_restr_gr

*sum up all of the growth rates to get the overall MSA growth rate
bys CBSA10 year: egen bartik_empgr_for_msa = sum(us_emp_gr_w)
bys CBSA10 year: egen bartik_empgr_restr_for_msa = sum(us_emp_restr_gr_w)

replace bartik_empgr_for_msa = . if year < 1984
replace bartik_empgr_restr_for_msa = . if year < 1984
	*first time period, becomes zero but actually missing

la var bartik_empgr_for_msa "Bartik Empl. Growth"
la var bartik_empgr_restr_for_msa "Bartik Empl. Growth (restricted)"

*lag 
xtset index year 
g l_bartik_empgr_for_msa = L.bartik_empgr_for_msa
replace bartik_empgr_for_msa = l_bartik_empgr_for_msa
g l_bartik_empgr_restr_for_msa = L.bartik_empgr_restr_for_msa
replace bartik_empgr_restr_for_msa = l_bartik_empgr_restr_for_msa

*collapse to MSA-time level dataset
collapse (max) bartik_empgr_for_msa bartik_empgr_restr_for_msa, by(CBSA10 year)

save `data'/bartik_msa_weighted_for_annual, replace



///////////////////////////////////////////////////////////////////////////////
**************Quarterly*************************
**************Backwards measure******************

use `data'/bartik_msa_merged, clear

g year = yofd(dofq(time))
gen qtr = quarter(dofq(time))

*aggregate by industry to get full employment measure in the state
g sum_emp = .
bys CBSA10 time: ereplace sum_emp = sum(total_emp_back)

*generate share of industry employment of all employment in state 
g share = total_emp_back / sum_emp

*employment shares should sum to 1 within a state and time 
bys CBSA10 time: egen check = sum(share)

tempfile CBSA10
save `CBSA10'

/////
*create national industry employment measure
use `data'/bartik_nat_merged, clear

rename date_quarter time

rename total_emp_naic us_emp_naic
rename total_emp_sic us_emp_sic
rename total_emp_for us_emp_for
rename total_emp_back us_emp_back

keep time ind_agg us_emp_naic us_emp_sic us_emp_for us_emp_back

merge 1:m ind_agg time using `CBSA10'
keep if _merge ==3 
drop _merge

*create national industry employment for a MSA that excludes employment in that MSA
bys CBSA10 ind_agg time: g us_emp_restr_naic = us_emp_naic - total_emp_naic
bys CBSA10 ind_agg time: g us_emp_restr_sic = us_emp_sic - total_emp_sic

*reconstructed version for growth splice 
bys CBSA10 ind_agg time: g total_emp_back_restr = us_emp_back - total_emp_back

egen index =  group(CBSA10 ind_agg)
xtset index time

*national industry employment growth rate
*splice! Use reconstructed NAICS in 2000 to create 2000-2001 growth rate
	*our reconstructed based on SIC growth rates 
g us_emp_gr_naic = 100*((us_emp_naic/L4.us_emp_naic)-1)
g us_emp_gr_sic = 100*((us_emp_sic/L4.us_emp_sic)-1)

g us_emp_gr = us_emp_gr_sic if year <= 2000
replace us_emp_gr = us_emp_gr_naic if year > 2001

forval q = 1/4 {
	replace us_emp_gr = ((us_emp_naic/L4.us_emp_back)-1)*100 if year == 2001 & qtr == `q'
}

g us_emp_restr_gr_naic = 100*((us_emp_restr_naic/L4.us_emp_restr_naic)-1)
g us_emp_restr_gr_sic = 100*((us_emp_restr_sic/L4.us_emp_restr_sic)-1)

g us_emp_restr_gr = us_emp_restr_gr_sic if year <= 2000
replace us_emp_restr_gr = us_emp_restr_gr_naic if year > 2001

forval q = 1/4 {
	replace us_emp_restr_gr = ((us_emp_restr_naic/L4.total_emp_back_restr)-1)*100 if year == 2001 & qtr == `q'
}

 

/////
*weight national industry growth rates by lagged share of employment of industry in each MSA
g us_emp_gr_w = L.share*us_emp_gr
g us_emp_restr_gr_w = L.share*us_emp_restr_gr

*sum up all of the growth rates to get the overall MSA growth rate
bys CBSA10 time: egen bartik_empgr_back_msa  = sum(us_emp_gr_w)
bys CBSA10 time: egen bartik_empgr_restr_back_msa  = sum(us_emp_restr_gr_w)

replace bartik_empgr_back_msa  = . if time < 97
replace bartik_empgr_restr_back_msa  = . if time < 97
	*first time period, becomes zero but actually missing

la var bartik_empgr_back_msa "Bartik Empl. Growth"
la var bartik_empgr_restr_back_msa  "Bartik Empl. Growth (restricted)"

*lag 
xtset index time 
g l_bartik_empgr_back_msa = L.bartik_empgr_back_msa
replace bartik_empgr_back_msa = l_bartik_empgr_back_msa
g l_bartik_empgr_restr_back_msa = L.bartik_empgr_restr_back_msa
replace bartik_empgr_restr_back_msa = l_bartik_empgr_restr_back_msa

*collapse to MSA-time level dataset
collapse (max) bartik_empgr_back_msa bartik_empgr_restr_back_msa , by(CBSA10 year qtr time)

save `data'/bartik_msa_weighted_back, replace



**************ANNUAL*************************
**************Backwards measure******************

use `data'/bartik_msa_merged, clear

g year = yofd(dofq(time))
gen qtr = quarter(dofq(time))

*make annual
collapse (mean) total_emp_back total_emp_naic total_emp_sic, by(year CBSA10 ind_agg)

*aggregate by industry to get full employment measure in the state
g sum_emp = .
bys CBSA10 year: ereplace sum_emp = sum(total_emp_back)

*generate share of industry employment of all employment in state 
g share = total_emp_back / sum_emp

*employment shares should sum to 1 within a state and time 
bys CBSA10 year: egen check = sum(share)

tempfile CBSA10
save `CBSA10'

/////
*create national industry employment measure
use `data'/bartik_nat_merged, clear

rename date_quarter time
g year = yofd(dofq(time))

rename total_emp_naic us_emp_naic
rename total_emp_sic us_emp_sic
rename total_emp_for us_emp_for
rename total_emp_back us_emp_back

keep year ind_agg us_emp_naic us_emp_sic us_emp_for us_emp_back

*make annual
collapse (mean) us_emp_naic us_emp_sic us_emp_for us_emp_back, by(year ind_agg)

merge 1:m ind_agg year using `CBSA10'
keep if _merge ==3 
drop _merge

*create national industry employment for a MSA that excludes employment in that MSA
bys CBSA10 ind_agg year: g us_emp_restr_naic = us_emp_naic - total_emp_naic
bys CBSA10 ind_agg year: g us_emp_restr_sic = us_emp_sic - total_emp_sic

*reconstructed version for growth splice 
bys CBSA10 ind_agg year: g total_emp_back_restr = us_emp_back - total_emp_back

egen index =  group(CBSA10 ind_agg)
xtset index year

*national industry employment growth rate
*splice! Use reconstructed NAICS in 2000 to create 2000-2001 growth rate
g us_emp_gr_naic = 100*((us_emp_naic/L.us_emp_naic)-1)
g us_emp_gr_sic = 100*((us_emp_sic/L.us_emp_sic)-1)

g us_emp_gr = us_emp_gr_sic if year <= 2000
replace us_emp_gr = us_emp_gr_naic if year > 2001

replace us_emp_gr = 100*((us_emp_naic/L.us_emp_back)-1) if year == 2001 


g us_emp_restr_gr_naic = 100*((us_emp_restr_naic/L.us_emp_restr_naic)-1)
g us_emp_restr_gr_sic = 100*((us_emp_restr_sic/L.us_emp_restr_sic)-1)

g us_emp_restr_gr = us_emp_restr_gr_sic if year <= 2000
replace us_emp_restr_gr = us_emp_restr_gr_naic if year > 2001

replace us_emp_restr_gr = 100*((us_emp_restr_naic/L.total_emp_back_restr)-1) if year == 2001 

/////
*weight national industry growth rates by lagged share of employment of industry in each MSA
g us_emp_gr_w = L.share*us_emp_gr
g us_emp_restr_gr_w = L.share*us_emp_restr_gr

*sum up all of the growth rates to get the overall MSA growth rate
bys CBSA10 year: egen bartik_empgr_back_msa  = sum(us_emp_gr_w)
bys CBSA10 year: egen bartik_empgr_restr_back_msa  = sum(us_emp_restr_gr_w)

replace bartik_empgr_back_msa  = . if year < 1984
replace bartik_empgr_restr_back_msa  = . if year < 1984
	*first time period, becomes zero but actually missing

la var bartik_empgr_back_msa "Bartik Empl. Growth"
la var bartik_empgr_restr_back_msa  "Bartik Empl. Growth (restricted)"

*lag 
xtset index year 
g l_bartik_empgr_back_msa = L.bartik_empgr_back_msa
replace bartik_empgr_back_msa = l_bartik_empgr_back_msa
g l_bartik_empgr_restr_back_msa = L.bartik_empgr_restr_back_msa
replace bartik_empgr_restr_back_msa = l_bartik_empgr_restr_back_msa

*collapse to MSA-time level dataset
collapse (max) bartik_empgr_back_msa bartik_empgr_restr_back_msa , by(CBSA10 year)

save `data'/bartik_msa_weighted_back_annual, replace

}

}
